In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import roc_auc_score

from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor

import warnings 
warnings.filterwarnings('ignore')
import os 
    
# Data View
pd.options.display.max_columns = 200


# Import Basic Visualization
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline
    
# Data Visualization : Plotly library 
import plotly.express as px
    
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
from plotly.subplots import make_subplots      



ModuleNotFoundError: No module named 'plotly'

In [None]:
# Load the training data from the CSV file into a DataFrame.
train = pd.read_csv("data/Training Data.csv")

In [None]:
# Display the loaded DataFrame (the notebook renders the cell's last expression).
train

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
train.describe()

In [None]:
# Numeric columns, leaving out the row identifier and the target.
Quantitive_features = [
    name
    for name in train.select_dtypes([np.number]).columns
    if name not in ['Id', 'Risk_Flag']
]

# Numeric columns with fewer than 10 distinct values are treated as discrete.
Discrete_features = [
    name
    for name in Quantitive_features
    if train[name].nunique(dropna = False) < 10
]

# Every remaining numeric column is treated as continuous.
Continuous_features = [
    name
    for name in Quantitive_features
    if name not in Discrete_features
]

# All non-numeric columns.
Categorical_features = train.select_dtypes(exclude = [np.number]).columns

# Target column.
df_Target = train['Risk_Flag']

In [None]:
# Print the four feature groups, one per line.
feature_groups = (
    Quantitive_features,
    Discrete_features,
    Continuous_features,
    Categorical_features,
)
print(
    "Quantitive feautres : {} \n Discrete features : {} \n Continous features : {} \n Categorical features : {}"
    .format(*feature_groups)
)

In [None]:
# Mean of Risk_Flag expressed as a percentage.
Current_risk_pec = 100 * train['Risk_Flag'].mean()
print(
    "Current risk percent of dataset : {}%".format(Current_risk_pec)
)

In [None]:
def Categorical_Features_Univarate(df, fea) :
    """Show a horizontal bar chart of the value counts of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    fea : str
        Name of the column whose distinct values are counted.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Count once and reuse, instead of recomputing value_counts() three times.
    counts = df[fea].value_counts()

    # Colour bars by rank along the RdBu scale. The palette has only 11
    # entries, so slicing it per bar gave no colour to bars beyond the 11th
    # on high-cardinality columns; a colourscale covers any number of bars.
    bar_ranks = list(range(len(counts)))

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x = counts,
        y = counts.index,
        orientation = 'h',
        marker = dict(color = bar_ranks, colorscale = px.colors.sequential.RdBu)))
    fig.show()

In [None]:
# Horizontal bar chart of the number of rows per STATE value.
Categorical_Features_Univarate(train, "STATE")

In [None]:
# Horizontal bar chart of the number of rows per CURRENT_HOUSE_YRS value.
Categorical_Features_Univarate(train, 'CURRENT_HOUSE_YRS')

In [None]:
# Five most frequent professions with their row counts.
train['Profession'].value_counts().head()

In [None]:
# Five most frequent cities with their row counts.
train['CITY'].value_counts().head()

In [None]:
# Correlate numeric columns only. `train` also holds string columns
# (Profession, CITY, STATE, ...), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently dropping them.
corr = train.select_dtypes(include = [np.number]).corr()

# Hide the diagonal and upper triangle so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
corr = corr.mask(mask)

fig = go.Figure()
fig.add_trace(go.Heatmap(
    z = corr, 
    x = corr.columns.tolist(),
    y = corr.columns.tolist(),
    colorscale = 'RdBu',
    xgap = 1,
    ygap = 1,
    hoverinfo = "none"
    )
)

fig.update_layout(
    {
        "title": {
            "text": "<b>Correlation between features</b>",
            "x": 0.5,
            "y": 0.9,
            "font": {
                "size": 15
            }
        },
        "xaxis": {
            "title": "Columns",
            "tickfont": {
                "size": 8                
            }
        },
        "yaxis": {
            "title": "Columns",
            "tickfont": {
                "size": 8                
            }
        },
        "template":'plotly_white',
        # Put the first column at the top so the matrix reads like a table.
        "yaxis_autorange" : "reversed"
    }
)
fig.show()


In [None]:
# Share of risky rows per city, highest first, expressed as a percentage.
df_city = (
    train
    .groupby('CITY')['Risk_Flag']
    .mean()
    .sort_values(ascending = False)
    .reset_index()
)
df_city['Risk_Flag'] = df_city['Risk_Flag'].mul(100).round(2)

In [None]:
# Ten highest-risk cities followed by the ten lowest-risk ones.
df_city_plot = pd.concat(
    [df_city.head(10), df_city.tail(10)],
    ignore_index = True,
)

In [None]:
# Colour each bar by its rank along the RdBu scale. The palette has only 11
# entries, so slicing it per bar left bars beyond the 11th (out of up to 20)
# without an assigned colour; a colourscale covers any number of bars.
bar_ranks = list(range(len(df_city_plot)))
fig = go.Figure()
fig.add_trace(go.Bar(
    x = df_city_plot.CITY,
    y = df_city_plot.Risk_Flag,
    text = df_city_plot.Risk_Flag,
    texttemplate = "%{text}%",
    marker = dict(color = bar_ranks, colorscale = px.colors.sequential.RdBu)))

fig.update_layout(
    {
        "title": {
            "text": "<b>Risk Percentage by CITY</b>",
            "x": 0.5,
            "y": 0.9,
            "font": {
                "size": 15
            }
        },
        "xaxis": {
            "title": "CITY",
            "tickfont": {
                "size": 10                
            }
        },
        "yaxis": {
            "title": "Risk Percentage",
            "tickfont": {
                "size": 10                
            }
        },
        "template":'plotly_white'
    }
)

fig.show()

In [None]:
# List the distinct profession values.
train['Profession'].unique()

In [None]:
# Manual grouping of the professions into six broad categories:
# Science: Microbiologist, Technician, Technology_specialist, Technical_writer, Scientist, Statistician, Geologist
# Engineering: Mechanical_engineer, Civil_engineer, Petroleum_Engineer, Software_Developer, Biomedical_Engineer, Computer_hardware_engineer, Computer_operator, Chemical_engineer, Industrial_Engineer, Engineer, Web_designer
# Social and cultural: Flight_attendant, Air_traffic_controller, Firefighter, Army_officer, Police_officer, Civil_servant, Librarian, Official, Comedian, Drafter, Politician, Secretary, Magistrate, Psychologist, Lawyer
# Art: Graphic_Designer, Designer, Design_Engineer, Fashion_Designer, Artist
# Business: Economist, Financial_Analyst, Hotel_Manager, Consultant, Chartered_Accountant, Analyst
# Professional: Architect, Aviator, Chef, Physician, Dentist, Surveyor, Surgeon

In [None]:
# Profession names belonging to each of the six profession groups.
Science = [
    'Microbiologist',
    'Technician',
    'Technology_specialist',
    'Technical_writer',
    'Scientist',
    'Statistician',
    'Geologist',
]
Engineering = [
    'Mechanical_engineer',
    'Civil_engineer',
    'Petroleum_Engineer',
    'Software_Developer',
    'Biomedical_Engineer',
    'Computer_hardware_engineer',
    'Computer_operator',
    'Chemical_engineer',
    'Industrial_Engineer',
    'Engineer',
    'Web_designer',
]
Social = [
    'Flight_attendant',
    'Air_traffic_controller',
    'Firefighter',
    'Army_officer',
    'Police_officer',
    'Civil_servant',
    'Librarian',
    'Official',
    'Comedian',
    'Drafter',
    'Politician',
    'Secretary',
    'Magistrate',
    'Psychologist',
    'Lawyer',
]
Art = [
    'Graphic_Designer',
    'Designer',
    'Design_Engineer',
    'Artist',
    'Fashion_Designer',
]
Business = [
    'Economist',
    'Financial_Analyst',
    'Hotel_Manager',
    'Consultant',
    'Chartered_Accountant',
    'Analyst',
]
Professional = [
    'Architect',
    'Aviator',
    'Chef',
    'Physician',
    'Dentist',
    'Surveyor',
    'Surgeon',
]

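# NOTE: disabled draft, kept for reference only. As written it would not work:
# it tests the whole `train['Profession']` column instead of the row `x`, and
# it has no branch for the `Social` group.
# def get_prof_group(x):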
#     if train['Profession'] in Science:
#         return 'Science'
#     if train['Profession'] in Engineering:
#         return 'Engineering'
#     if train['Profession'] in Art:
#         return 'Art'
#     if train['Profession'] in Business:
#         return 'Business'
#     if train['Profession'] in Professional:
#         return 'Professional'

# train.loc[:,'Prof_Group'] = train.apply(get_prof_group, axis = 1)


        


In [None]:
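# NOTE: disabled draft, kept for reference only. Here `i` is a profession name,
# not a row label, so `train['Prof_Group'][i] = ...` would not target the
# matching rows; it also has no branch for the `Social` group.
# for i in train['Profession']: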
#     if i in Science:
#         train['Prof_Group'][i] = 'Science'
#     if i in Engineering:
#         train['Prof_Group'][i] = 'Engineering'
#     if i in Art:
#         train['Prof_Group'][i] = 'Art'
#     if i in Business:
#         train['Prof_Group'][i] = 'Business'
#     if i in Professional:
#         train['Prof_Group'][i] = 'Professional'

In [None]:
# Build one profession -> group lookup and apply it in a single pass, instead
# of six copy-pasted loops that each scan the whole Profession column once per
# profession name. Groups are added in the same order as before, so a name
# listed in two groups still resolves to the later one.
profession_to_group = {}
for group_name, professions in (
    ('Science', Science),
    ('Engineering', Engineering),
    ('Art', Art),
    ('Business', Business),
    ('Professional', Professional),
    ('Social', Social),
):
    profession_to_group.update(dict.fromkeys(professions, group_name))

# Professions absent from every list become NaN in Prof_Group.
train['Prof_Group'] = train['Profession'].map(profession_to_group)

In [None]:
# Share of risky rows per profession group, highest first, as a percentage.
df_prof = (
    train
    .groupby('Prof_Group')['Risk_Flag']
    .mean()
    .sort_values(ascending = False)
    .reset_index()
)
df_prof['Risk_Flag'] = df_prof['Risk_Flag'].mul(100).round(2)

In [None]:
# Plot every profession group. The previous version concatenated the last ten
# rows of df_city (a copy-paste leftover from the city plot), which put city
# rows with no Prof_Group into the profession chart.
df_prof_plot = df_prof.reset_index(drop = True)

In [None]:
# Bar chart of the risk percentage per profession group, one RdBu colour per bar.
colors = px.colors.sequential.RdBu[:len(df_prof_plot)]
fig = go.Figure()
fig.add_trace(go.Bar(
    x = df_prof_plot.Prof_Group,
    y = df_prof_plot.Risk_Flag,
    text = df_prof_plot.Risk_Flag,
    texttemplate = "%{text}%",
    marker_color = colors  ))

fig.update_layout(
    {
        "title": {
            # Spelling fixed: the title previously read "Profssion".
            "text": "<b>Risk Percentage by Profession Group</b>",
            "x": 0.5,
            "y": 0.9,
            "font": {
                "size": 15
            }
        },
        "xaxis": {
            "title": "Profession Group",
            "tickfont": {
                "size": 10                
            }
        },
        "yaxis": {
            "title": "Risk Percentage",
            "tickfont": {
                "size": 10                
            }
        },
        "template":'plotly_white'
    }
)

fig.show()

In [None]:
# Rows whose profession was not assigned to any group. Unassigned rows hold
# NaN rather than an empty string, so the check must use isna(); comparing
# with '' never matches them.
train.loc[train['Prof_Group'].isna()]

In [None]:
def get_age_group(x):
    """Return the age bucket label for a row.

    `x` is indexed with the key 'Age'. Ages below 30 map to "Young", ages
    from 30 up to (but not including) 50 map to "Middle Age", and anything
    else maps to "Elder".
    """
    age = x['Age']
    if age < 30:
        return "Young"
    if age < 50:
        return "Middle Age"
    return "Elder"

# Label every row with its age bucket.
train['Age_group'] = train.apply(get_age_group, axis = 'columns')

In [None]:
# Share of risky rows per age group, highest first, expressed as a percentage.
df_age = train.groupby('Age_group')['Risk_Flag'].mean().sort_values(ascending = False).reset_index()
df_age['Risk_Flag'] = round(df_age['Risk_Flag']*100, 2)

# Plot every age group. The earlier head(1) + tail(2) concat only equals the
# full table when exactly three groups exist, and duplicates rows otherwise.
df_age_plot = df_age.reset_index(drop = True)

In [None]:
# Risk percentage per age group, in the sorted (descending) order.
df_age['Risk_Flag']

In [None]:
# Risk percentage values that will be plotted.
df_age_plot['Risk_Flag']

In [None]:
# Unstyled bar chart of the risk percentage per age group.
colors = px.colors.sequential.RdBu[:len(df_age_plot)]
fig = go.Figure(
    data = [
        go.Bar(
            x = df_age_plot.Age_group,
            y = df_age_plot.Risk_Flag,
            text = df_age_plot.Risk_Flag,
            texttemplate = "%{text}%",
            marker_color = colors,
        )
    ]
)
fig.show()

In [None]:
# Titled bar chart of the risk percentage per age group, one RdBu colour per bar.
colors = px.colors.sequential.RdBu[:len(df_age_plot)]
fig = go.Figure(
    data = [
        go.Bar(
            x = df_age_plot.Age_group,
            y = df_age_plot.Risk_Flag,
            text = df_age_plot.Risk_Flag,
            texttemplate = "%{text}%",
            marker_color = colors,
        )
    ]
)

fig.update_layout(
    title = dict(
        text = "<b>Risk Percentage by Age Group</b>",
        x = 0.5,
        y = 0.9,
        font = dict(size = 15),
    ),
    xaxis = dict(title = "Age Group", tickfont = dict(size = 10)),
    yaxis = dict(title = "Risk Percentage", tickfont = dict(size = 10)),
    template = 'plotly_white',
)

fig.show()