# Exploratory Data Analysis Demo


## Weather Data

This dataset contains one year of daily observations from a weather station in Canberra, Australia. It is derived from the weather dataset in the R 'rattle' package: https://cran.r-project.org/web/packages/rattle.data/rattle.data.pdf


## Read in Data

In [None]:
import pandas as pd
import os
import requests

In [None]:
# Location of the local copy of the dataset and the URL it is fetched from.
thePath = './'  # Adjust this path as necessary
theFilename = 'weather-orig.csv'
theLink = "https://dse200.dev/Day3/weather-orig.csv"

# Download the CSV only when no local copy exists yet.
if not os.path.exists(thePath + theFilename):
    r = requests.get(theLink, timeout=30)
    # Stop on an HTTP error rather than saving an error page as the CSV.
    r.raise_for_status()
    with open(thePath + theFilename, 'wb') as f:
        f.write(r.content)

# Load the weather file handled above (this cell previously read an unrelated
# 'SalesAds.csv', which the notebook never downloads).
theFile = theFilename
df = pd.read_csv(thePath + theFile)
print(f"shape {df.shape}")


In [None]:
# Preview the first rows of the loaded data (head() defaults to 5 rows).
df.head()

In [None]:
# Summary statistics for the freshly loaded data.
df.describe()

In [None]:
# Show the Date column as loaded, before any type conversion.
print(df['Date'])

In [None]:
# Convert Date to datetime data type, then print the column again so the
# resulting dtype can be checked.
df['Date'] = pd.to_datetime(df['Date'])
print(df['Date'])

## Examine rows

In [None]:
# Look at the first 10 rows.
df.head(10)

In [None]:
# Look at the last rows (tail() defaults to 5 rows).
df.tail()

## Examine columns

In [None]:
# Dimensions of the DataFrame as (number of rows, number of columns).
df.shape

In [None]:
# Same information with a label; the trailing '\n' leaves a blank line after it.
print('Dimensions:', df.shape, '\n')

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Print the column labels as a plain Python list.
print('Columns:\n', list(df.columns))

In [None]:
# Dimensions and column labels together in one cell.
print('Dimensions:', df.shape, '\n')
print('Columns:\n', list(df.columns))

## Get Summary Statistics

### Summary statistics for entire dataset

In [None]:
# NOTE:  Summary statistics apply to numerical columns only.
# .T transposes the displayed result, so each described column becomes a row.
df.describe().T

### Summary statistics for single feature

In [None]:
# Report a handful of descriptive statistics for a single column.
feature = "WindGustSpeed"
column = df[feature]
divider = "========"

print("Summary statistics on", feature)
print(divider)
# Each entry pairs the printed label with the statistic it introduces.
for label, value in [
    ("average: ", column.mean()),
    ("std: ", column.std()),
    ("max: ", column.max()),
    ("min: ", column.min()),
    ("count: ", column.count()),
    ("median: ", column.median()),
]:
    print(label, value)
print(divider)

In [None]:
# describe() on one column reports its summary statistics in a single call.
df[feature].describe()

In [None]:
# Three ways to build the same message: several print() arguments, string
# concatenation, and an f-string. print() separates its arguments with a
# space, so the first line shows a space before the colon; the other two do not.
print("The summary statistics on", feature, ": ")
print("The summary statistics on " + feature + ": ")
print(f"The summary statistics on {feature}: ")

### Other summary statistics

In [None]:
# Date range covered by the data, plus total and mean daily rainfall.
print("first date recorded: ", df['Date'].min())
print("last date recorded: ", df['Date'].max())
print("total rainfall amount for year: ", df['Rainfall'].sum())
# mean() divides the sum by the number of non-missing values, which is what
# the manual sum() / count() computed.
print("average rainfall daily: ", df['Rainfall'].mean())

In [None]:
# Attribute access (df.Date) selects the same column as df['Date'].
print(df.Date)

### Dataframe info

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Overview of the DataFrame: columns, non-null counts and dtypes.
df.info()

## Handling Missing Values

### Determine features with missing values

In [None]:
# Count the missing (NA) values in every column.
print("NAs for each feature:")
df.isna().sum()

In [None]:
# Features without missing values: keep the columns in which every entry is present
complete_mask = df.notna().all()
df.columns[complete_mask].tolist()

### Remove rows with missing values

In [None]:
# Compare the DataFrame's dimensions before and after dropping every row
# that contains a missing value.
print("Dimensions before removing null values: ", df.shape)

# inplace=True modifies df itself, so every later cell works on the reduced data.
df.dropna(inplace=True)   # NOTE: Original df is overwritten
print("Dimensions after removing null values:  ", df.shape)

In [None]:
# Summary statistics of the remaining rows, rounded to 2 decimals and transposed.
df.describe().round(2).T

### Save data without missing values to new file

In [None]:
# Save the data in csv file format, not including the index as a column.
# The relative filename is resolved against the current working directory.
df.to_csv("weather-na-omit.csv", index=False)

## Explore by Data Types

In [None]:
# Review the column dtypes before exploring the features by type.
df.info()

### Explore numerical features
- int64
- float64

In [None]:
# Names of the columns stored as float64 or int64.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
print("Numeric features: \n", numeric_columns)

In [None]:
# Select only numerical features (float64 and int64 columns) into their own
# DataFrame; df_num is reused by the correlation and heat map cells.
df_num = df.select_dtypes(include=['float64', 'int64'])
df_num.head()

In [None]:
# Correlation between two features (a single coefficient)
df_num['Evaporation'].corr(df_num['Temp9am'])

In [None]:
# Pair-wise correlation between all columns of df_num, returned as a matrix
df_num.corr()

### Explore categorical features

In [None]:
# Print values (i.e., categories) for categorical feature
feature = "WindDir3pm"
# Store the column with the 'category' dtype so that the .cat accessor is available.
df[feature] = df[feature].astype('category')
category_names = df[feature].cat.categories.tolist()
print("Values for ", feature, "\n", category_names)

In [None]:
# Get counts of unique values (number of rows per category)
df[feature].value_counts()

In [None]:
# Get frequency of unique values: normalize=True returns each count divided
# by the total, replacing the manual division that called value_counts() twice.
df[feature].value_counts(normalize=True).round(4)

# Data Visualization

In [None]:
# Have plots display in notebook
%matplotlib inline

from matplotlib import pyplot as plt
import seaborn as sns

## Histogram

In [None]:
# Summary statistics of the feature plotted in the cells that follow.
df['Sunshine'].describe()

### Using matplotlib

In [None]:
# Histogram of Sunshine with 10 bins, drawn through matplotlib's Axes interface.
fig, ax = plt.subplots()
ax.hist(df['Sunshine'], bins=10)
ax.set_title("Histogram of Daily Sunshine")
ax.set_xlabel("Sunshine (Hours)")
ax.set_ylabel("Frequency")
plt.show()

### Using pandas (with matplotlib as backend)

In [None]:
# Series.plot.hist() returns the matplotlib Axes it drew on; label that Axes.
hist_ax = df['Sunshine'].plot.hist()
hist_ax.set_title("Histogram of Daily Sunshine")
hist_ax.set_xlabel('Sunshine (Hours)')
hist_ax.set_ylabel("Frequency")
plt.show()

## Density Plot

### Using pandas/matplotlib

In [None]:
# Kernel density estimate of Sunshine; the returned Axes carries the labels.
kde_ax = df['Sunshine'].plot.kde()
kde_ax.set_title("Density Plot of Daily Sunshine")
kde_ax.set_xlabel("Sunshine (Hours)")
kde_ax.set_ylabel("Density")
plt.show()

### Using seaborn

In [None]:
# Histogram with 30 bins and a KDE curve overlaid (kde=True).
sunshine_ax = sns.histplot(df['Sunshine'], kde=True, bins=30)
sunshine_ax.set_title("Histogram of Daily Sunshine")
sunshine_ax.set_xlabel('Sunshine (Hours)')
sunshine_ax.set_ylabel("Frequency")
plt.show()

## Bar plots for categorical features

### Using matplotlib

In [None]:
# Distinct wind gust directions present in the data.
df['WindGustDir'].unique()

In [None]:
# Order the categories by compass direction (N, E, S, W) instead of by
# frequency, which is the order value_counts() returns.
x_data = ['N','NNE','NE','ENE','E','ESE','SE','SSE','S','SSW','SW','WSW','W','WNW','NW','NNW']
freq_data = df['WindGustDir'].value_counts()
# Alternative ordering, running the other way around the compass:
# x_data = ['N','NNW','NW','WNW','W','WSW','SW','SSW','S','SSE','SE','ESE','E','ENE','NE','NNE']
# reindex() yields a count of 0 for any direction absent from the data,
# where the direct lookup freq_data[k] would raise a KeyError.
y_data = freq_data.reindex(x_data, fill_value=0).tolist()

In [None]:
# Bar chart of how often each wind gust direction occurs.
plt.figure(figsize=(10, 5))
plt.bar(x=x_data, height=y_data)
plt.title("Distribution of Wind Gust Direction")
# Directions run along the x-axis and counts are the bar heights
# (the two axis labels were previously swapped).
plt.xlabel("Wind Gust Direction")
plt.ylabel("Frequency")
plt.show()

### Using seaborn

In [None]:
# Same bar chart drawn with seaborn.
plt.figure(figsize=(10, 5))
sns.barplot(x=x_data, y=y_data, color="skyblue")
plt.title("Distribution of Wind Gust Direction")
# Directions run along the x-axis and counts along the y-axis
# (the two axis labels were previously swapped).
plt.xlabel("Wind Gust Direction")
plt.ylabel("Frequency")
plt.show()

## Scatter Plot

In [None]:
# Scatter plot of Temp9am against Evaporation via matplotlib's Axes interface.
fig, ax = plt.subplots()
ax.scatter(df['Evaporation'], df['Temp9am'])
ax.set_title('Evaporation vs. Temp9am')
ax.set_xlabel('Evaporation')
ax.set_ylabel('Temp9am')
plt.show()

In [None]:
# Same scatter plot with seaborn, selecting the columns of df by name.
sns.scatterplot(x='Evaporation', y='Temp9am', data=df)
plt.show()

In [None]:
# regplot draws the points (here with '+' markers) together with a fitted
# regression line.
sns.regplot(x='Evaporation', y='Temp9am', data=df, marker='+')
plt.show()

## Line Plot

In [None]:
# Line plot of MaxTemp over time via matplotlib's Axes interface.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(df['Date'], df['MaxTemp'])
ax.set_title('Daily High Temperature')
plt.show()

In [None]:
# Same line plot with seaborn; lineplot returns the Axes it drew on.
plt.figure(figsize=(10,6))
line_ax = sns.lineplot(data=df, x='Date', y='MaxTemp')
line_ax.set_title('Daily High Temperature')
plt.show()

## Box Plot

In [None]:
# Box plot of Pressure9am, one box per RainToday value.
plt.figure(figsize=(10,6))
pressure_ax = sns.boxplot(data=df, x='RainToday', y='Pressure9am')
pressure_ax.set_title('Atmospheric Pressure wrt Rain')
plt.show()

In [None]:
# Box plot of MaxTemp grouped by the month number taken from Date.
month_numbers = pd.DatetimeIndex(df['Date']).month
plt.figure(figsize=(10,6))
box_ax = sns.boxplot(x=month_numbers, y='MaxTemp', data=df)
box_ax.set_title('High Temperature by Month')
box_ax.set_xlabel('Month')
plt.show()

## Violin Plot

In [None]:
# Violin plot of MaxTemp grouped by the month number taken from Date.
plt.figure(figsize=(10,6))
sns.violinplot(x=pd.DatetimeIndex(df['Date']).month, y='MaxTemp', data=df)
plt.title('High Temperature by Month')
# Label the x-axis explicitly, as the box plot of the same data does;
# the plotted x values are month numbers, not dates.
plt.xlabel('Month')
plt.show()

## Heat Map

In [None]:
# Heat map of the pairwise correlations between all columns of df_num.
corr_matrix = df_num.corr()
plt.figure(figsize=(10,8))
heat_ax = sns.heatmap(corr_matrix, annot=False, fmt="f", cmap="YlGnBu", linewidths=0.5)
heat_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Annotated heat map for a hand-picked subset of features, with each cell
# showing its correlation to 4 decimal places.
selected_features = ['Sunshine', 'Pressure9am', 'Humidity9am', 'Temp9am']
pair_corr = df_num[selected_features].corr()
plt.figure(figsize=(10,8))
pair_ax = sns.heatmap(pair_corr, annot=True, fmt=".4f", cmap="YlGnBu", linewidths=4.5)
pair_ax.set_title('Pairwise Correlation')
plt.show()