In [None]:
!pip install numpy xlrd pandas matplotlib seaborn sklearn

In [None]:
#Import your Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
%matplotlib inline

In [None]:
# %%timeit -n 1
# Load your data  -- start with CreditScoring.csv... then online retail
# Point data_path at your own CSV file before running this cell.
data_path = './data/REPLACE_WITH_YOUR_FILE'
df = pd.read_csv(data_path)
# A URL works as well, for example:
# df = pd.read_csv('https://raw.githubusercontent.com/fenago/MLEssentials/main/datasets/Life%20Expectancy%20Data.csv')

## Notes

This session covers data collection and some procedures of data preparation. 

**Commands, functions, and methods:** 

* `!wget` - Linux shell command for downloading data 
* `pd.read_csv()` - read csv files 
* `df.head()` - take a look at the dataframe 
* `df.head().T` - take a look at the transposed dataframe 
* `df.columns` - retrieve column names of a dataframe 
* `df.columns.str.lower()` - lowercase all the letters 
* `df.columns.str.replace(' ', '_')` - replace the space separator 
* `df.dtypes` - retrieve data types of all series 
* `df.index` - retrieve indices of a dataframe
* `pd.to_numeric()` - convert a series' values to numerical values. The `errors='coerce'` argument allows the transformation to proceed despite unparseable values, which are replaced with NaN. 
* `df.fillna()` - replace NAs with some value 
* `(df.x == "yes").astype(int)` - convert x series of yes-no values to numerical values.
* `df['Weight'] = df['Weight'].astype(int)` - this takes a single column of data and converts the data type

In [None]:
# Number of rows in the loaded DataFrame
len(df)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns pandas treats as numeric
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column names, non-null counts and dtypes -- a quick way to spot missing data and wrongly typed columns
df.info()

In [None]:
# Number of distinct values per column (helps tell categorical from continuous columns)
df.nunique()

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# numeric_only=True is required on pandas >= 2.0: without it, any string/object
# column in df makes corr() raise instead of being skipped.
df.corr(numeric_only=True)

In [None]:
# Basic Data Cleaning
# A: normalise the column headers -- lower case, spaces turned into underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# B: collect the names of the columns stored with the 'object' dtype
is_object = df.dtypes == 'object'
string_columns = df.dtypes[is_object].index.tolist()

# C: apply the same normalisation to the values held in those columns
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [None]:
# MAKE SURE THAT YOU WRANGLE YOUR DATA.  THIS IS AN EXAMPLE OF THE TYPES OF THINGS THAT ARE NEEDED
# SKIP THIS CELL - IT IS ONLY TO REITERATE THE NEED TO CLEAN 
# For instance - in the CreditScoring dataset - there are numerous 99999999 that need to be replaced
# Obviously don't run this with your dataset
# for c in ['income', 'assets', 'debt']:
#    df[c] = df[c].replace(to_replace=99999999, value=np.nan)
#df = df[df.status != 'unk']   # Also make sure to treat the target variable

In [None]:
# First five rows -- check the result of the cleaning step
df.head()

In [None]:
# Same five rows, transposed: one line per column, which is easier to read for wide datasets
df.head().T

### Create Visuals so you can gain a business understanding of your data

In [None]:
# Distribution of the target variable -- look for major data imbalances.
# Set target_col to the name of YOUR target column (as it appears in df.columns).
# It is used both to select the data and as the x-axis label, so the cell stays
# valid Python while the placeholder is still in place.
target_col = 'REPLACE_WITH_YOUR_TARGET_VARIABLE'

plt.figure(figsize=(6, 4))

sns.histplot(df[target_col], bins=40, color='black', alpha=1)
plt.ylabel('Frequency')
plt.xlabel(target_col)
plt.title('PUT A LABEL ON IT')

plt.show()

## Notes
* (1) Check for NaN under a single DataFrame column:

`df['your column name'].isnull().values.any()`

* (2) Count the NaN under a single DataFrame column:

`df['your column name'].isnull().sum()`

* (3) Check for NaN under an entire DataFrame:

`df.isnull().values.any()`

* (4) Count the NaN under an entire DataFrame:

`df.isnull().sum().sum()`

In [None]:
# Check for nulls --- you do NOT want nulls when you train
# Result: number of missing values in each column
df.isnull().sum()

In [None]:
# Share of missing values per column, expressed as a percentage of all rows
missing_per_column = df.isnull().sum()
row_count = df.shape[0]
missing_per_column / row_count * 100

In [None]:
# Check value counts
# For a single field use:  df["REPLACE_WITH_FIELD_NAME"].value_counts()
# As written, the call below counts each distinct combination of values across ALL columns (i.e. unique rows).
df.value_counts()

In [None]:
# First five rows again, as a reference before reshaping columns in the cells below
df.head()

In [None]:
#delete columns --- this may or may NOT be needed.  As before - skip if you don't need it
# You will encounter times where you will want to delete columns.  This is how you do that.
# df = df.drop(['x5_latitude', 'x6_longitude', 'x1_transaction_date'], axis=1)
# df

In [None]:
# Split Data
# i.e.:  address = London, UK
# df[['city', 'country']] = df['address'].str.split(',', expand=True)

In [None]:
# Change any Data Types
#Replace Data Types to Integer
# df["Customer Number"] = df['Customer Number'].astype('int')
#Replace Data Types to String
# df["Customer Number"] = df['Customer Number'].astype('str')
#Replace Data Types to Boolean
# df["IsPurchased"] = df['IsPurchased'].astype('bool')
#Replace Data Types to Float
# df["Total Spend"] = df['Total Spend'].astype('float')
#Replace Data Types to Datetime with format= '%Y%m%d'
# df['Dates'] = pd.to_datetime(df['Dates'], format='%Y%m%d')

## Univariate Analysis

In [None]:
## STICK TO CATEGORICAL COLUMNS INITIALLY
# Plot a histogram to see the distribution of the chosen variable.
# The DataFrame loaded in this notebook is named `df` (there is no `data` variable).
sns.displot(df, x="YOUR_VARIABLE")

In [None]:
# Bar chart with the number of rows in each category of the chosen variable
sns.countplot(data=df, x="YOUR_VARIABLE")

In [None]:
# The same category frequencies as a table (most frequent first)
df['YOUR_VARIABLE'].value_counts()

In [None]:
# Measure the skewness and kurtosis of a numeric variable.
# The DataFrame loaded in this notebook is named `df` (there is no `data` variable).
df['YOUR_VARIABLE'].agg(['skew', 'kurtosis']).transpose()

In [None]:
# Check for outliers: points drawn beyond the whiskers of the box plot are candidates.
# The DataFrame loaded in this notebook is named `df` (there is no `data` variable).
ax = sns.boxplot(x=df["YOUR_VARIABLE"])

![image info](https://miro.medium.com/max/1400/1*_aN1iaiVUTdoyPbyj-kVjA.jpeg)

## Bivariate Analysis

In [None]:
# Pick 2 variables to compare and replace x_col / hue_col with them.
# Stick with categorical variables for now.
# NOTE: the "Basic Data Cleaning" cell lower-cases every column name, so the names
# used here must be lower case too ('sex', not 'SEX') or pandas raises a KeyError.
x_col = 'sex'
hue_col = 'default'

sns.set(rc={'figure.figsize': (15, 10)})
edu = sns.countplot(x=x_col, hue=hue_col, data=df)
# Example-specific tick labels: they assume x_col has exactly two categories, in the
# order Male, Female. Edit or delete this line when you plot a different variable.
edu.set_xticklabels(['Male', 'Female'])
plt.show()

In [None]:
# Evaluate the cross tab: each row shows the share of the second variable's values
# within one category of the first (normalize='index'); margins=True adds an "All" row.
# Column names are lower case because the "Basic Data Cleaning" cell lower-cases them.
pd.crosstab(df['sex'], df['default'], normalize='index', margins=True)

## Correlation

In [None]:
# Pearson correlation heatmap.
# numeric_only=True is required on pandas >= 2.0: without it, any string/object
# column in df makes corr() raise instead of being skipped.
corrMatrix = df.corr(numeric_only=True)
sns.heatmap(corrMatrix, annot=True, cmap='coolwarm')

In [None]:
# Spearman Correlation -- plot settings only; the heatmap itself is drawn in the next cell.
# Use a wide 30x10 figure and the seaborn "talk" context with fonts scaled to 0.7.
sns.set(rc={'figure.figsize':(30,10)})
sns.set_context("talk", font_scale=0.7)

In [None]:
# Spearman (rank) correlation heatmap.
# iloc[:, 1:] leaves out the first column -- presumably an ID-like column in the example
# dataset; use df directly if your first column is a real feature.
# numeric_only=True keeps corr() from raising on string/object columns (pandas >= 2.0).
sns.heatmap(df.iloc[:, 1:].corr(method='spearman', numeric_only=True), cmap='rainbow_r', annot=True)

In [None]:
# Spearman correlation between your variable of interest and every other numeric variable.
# Replace 'default' with your variable of interest. Use the lower-case column name:
# the "Basic Data Cleaning" cell lower-cases all headers, so 'DEFAULT' would raise a KeyError.
target_col = 'default'

# Only numeric columns can be correlated; string/object columns would make Series.corr raise.
# errors='ignore' keeps the drop from failing if the target itself is not in the numeric subset.
numeric_features = df.select_dtypes(include='number').drop(columns=[target_col], errors='ignore')
numeric_features.apply(lambda col: col.corr(df[target_col], method='spearman'))