# Dataset plotting presentation
Tasks:
1. Read the dataset
2. Drop duplicates
3. Drop rows/columns that contain null values
4. Replace null values with zero/mean/median/mode
5. Scatter plot with and without hue
6. KDE plot with and without hue
7. Histogram with and without hue
8. Boxplot of all the features
9. Detect outliers with boxplot

### 1. Read the dataset

In [1]:
# import pandas
import pandas as pd

In [2]:
# Read the train and test CSV files into two separate dataframes.
# NOTE: both paths are relative, so the files must sit in the current working
# directory; otherwise pd.read_csv raises FileNotFoundError.
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'train_u6lujuX_CVtuZ9i.csv'

### Basic Stats
For the train dataset (`df_train`), followed by the test dataset (`df_test`)

In [None]:
# Size of the training set: row count first, then column count.
print("Number of rows: ", len(df_train))
print("Number of columns: ", len(df_train.columns))

In [None]:
# Size of the test set: row count first, then column count.
print("Number of rows: ", len(df_test))
print("Number of columns: ", len(df_test.columns))

In [None]:
# Print a summary of the training dataframe: columns, non-null counts and dtypes.
df_train.info()

### 2. Drop duplicates

In [None]:
# Number of Loan_ID values that repeat an earlier Loan_ID.
df_train['Loan_ID'].duplicated().sum()

In [None]:
# Number of rows that duplicate an earlier row across every column.
df_train.duplicated().sum()

In [None]:
# Remove fully duplicated rows first (while Loan_ID is still present, so only
# genuinely repeated records are dropped), then discard the Loan_ID identifier
# column, which the rest of the analysis does not use.
df1_train = df_train.drop_duplicates().drop(columns=['Loan_ID'])
df1_train.head()

### 3. Drop rows/columns that contain null values

In [None]:
# Count the missing values in each column.
df1_train.isna().sum()

### 4. Replace null values with zero/mean/median/mode

In [None]:
# Impute the categorical-like columns with their most frequent value (mode).
# DataFrame.fillna returns a new frame, so df1_train keeps its original nulls
# instead of being modified through an alias.
df2_train = df1_train.fillna({
    col: df1_train[col].mode()[0]
    for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
})
df2_train.isnull().sum()

In [None]:
# Impute the remaining numeric columns with their median.
# DataFrame.fillna returns a new frame, so df2_train is left untouched instead
# of being modified through an alias.
df3_train = df2_train.fillna({
    col: df2_train[col].median()
    for col in ['LoanAmount', 'Loan_Amount_Term']
})
df3_train.isnull().sum()

### 5. Scatter plot with and without hue

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Plot from the fully imputed training data (mode + median filled).
# The previously referenced name `df1_mode_median` is never defined in this
# notebook; df3_train is the dataframe that holds that result.
df1_plot = df3_train
sns.scatterplot(data=df1_plot, x='LoanAmount', y='ApplicantIncome')

In [None]:
# Same scatter plot, with points coloured by loan status.
sns.scatterplot(
    data=df1_plot,
    x='LoanAmount',
    y='ApplicantIncome',
    hue='Loan_Status',
)

### 6. KDE plot with and without hue

In [None]:
# Kernel density estimate of the co-applicant income.
sns.kdeplot(data=df1_plot, x='CoapplicantIncome')

In [None]:
# KDE of co-applicant income, one filled curve per loan status.
# `fill=True` replaces the `shade=True` keyword, which seaborn has deprecated
# in favour of `fill`.
sns.kdeplot(data=df1_plot, x='CoapplicantIncome', hue='Loan_Status', fill=True)

### 7. Histogram with and without hue

In [None]:
# Distinct values taken by Loan_Amount_Term.
df1_plot['Loan_Amount_Term'].unique()

In [None]:
# Histogram of the loan term (trailing semicolon hides the returned Axes repr).
sns.histplot(data=df1_plot, x='Loan_Amount_Term');

In [None]:
# Histogram of the loan term, split by loan status.
sns.histplot(data=df1_plot, x='Loan_Amount_Term', hue='Loan_Status');

### 8. Boxplot of all the features

In [None]:
# Loan amount distribution for each gender.
sns.boxplot(data=df1_plot, x='LoanAmount', y='Gender')

In [None]:
# Co-applicant income distribution by marital status.
sns.boxplot(data=df1_plot, x='CoapplicantIncome', y='Married')

In [None]:
# Applicant income distribution by number of dependents.
sns.boxplot(data=df1_plot, x='ApplicantIncome', y='Dependents')

In [None]:
# Applicant income distribution by self-employment status.
sns.boxplot(data=df1_plot, x='ApplicantIncome', y='Self_Employed')

In [None]:
# Loan term distribution by credit history.
sns.boxplot(data=df1_plot, x='Loan_Amount_Term', y='Credit_History')

### 9. Detect outliers with boxplot

In [None]:
# Tukey's fences for outlier detection: a value is treated as an outlier when
# it lies more than 1.5 * IQR below the first quartile or above the third
# quartile. Box plots are the visual counterpart of this rule.
def findOutliers(dataset):
    """Return the values of *dataset* that fall outside Tukey's fences.

    Args:
        dataset: A pandas Series of numeric values (anything iterable that
            also provides ``quantile`` works).

    Returns:
        A list with the outlying values in their original order. Missing
        values never compare below or above a fence, so they are not returned.
    """
    # Quartiles come straight from ``quantile``; indexing ``describe()`` by
    # position is deprecated for label-indexed Series and was redundant anyway.
    q1 = dataset.quantile(0.25)
    q3 = dataset.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    return [x for x in dataset if x < lower_fence or x > upper_fence]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df1_plot.describe()

In [None]:
# Box plot of applicant income, followed by the values flagged as outliers.
sns.boxplot(data=df1_plot, x='ApplicantIncome')
findOutliers(df1_plot['ApplicantIncome'])

In [None]:
# Box plot of co-applicant income, followed by the values flagged as outliers.
sns.boxplot(data=df1_plot, x='CoapplicantIncome')
findOutliers(df1_plot['CoapplicantIncome'])

In [None]:
# Box plot of the loan amount, followed by the values flagged as outliers.
sns.boxplot(data=df1_plot, x='LoanAmount')
findOutliers(df1_plot['LoanAmount'])

In [None]:
# Box plot of the loan term, followed by the values flagged as outliers.
sns.boxplot(data=df1_plot, x='Loan_Amount_Term')
findOutliers(df1_plot['Loan_Amount_Term'])

In [None]:
# Preview the first rows of the imputed training data.
df3_train.head()

In [None]:
# Print columns, non-null counts and dtypes of the imputed training data.
df3_train.info()

In [None]:
# Columns that get one-hot encoded before modelling.
cat_feats = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
df4_train = pd.get_dummies(df3_train, columns=cat_feats, drop_first=True)

In [None]:
# Preview the first rows of the one-hot encoded data.
df4_train.head()

In [None]:
# Keep only the columns that are not one-hot encoded (cat_feats lists exactly
# the columns removed here).
df3_1_train = df3_train.drop(columns=cat_feats)

In [None]:
# pd.get_dummies carries over every column it did not encode, so df4_train
# already holds the numeric features next to the dummy columns. Concatenating
# df3_1_train onto it would duplicate each numeric column (and its name), so
# the final training frame is simply a copy of df4_train.
df5_train = df4_train.copy()

In [None]:
# Preview the first rows of the final training frame.
df5_train.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest classifier built from 600 trees.
rfc = RandomForestClassifier(n_estimators=600)

In [None]:
# Separate the encoded target column from the feature matrix.
y_train = df5_train['Loan_Status_Y']
x_train = df5_train.drop('Loan_Status_Y', axis=1)

In [None]:
# Fit on the feature matrix only: df5_train still contains the Loan_Status_Y
# target column, so training on it would leak the label into the features.
rfc.fit(x_train, y_train)