In [32]:
import pandas as pd 
import numpy as np

In [33]:
train=pd.read_csv("/content/drive/MyDrive/univ.ai/univ_data/Training Data.csv")

In [34]:
train.risk_flag.value_counts()

0    221004
1     30996
Name: risk_flag, dtype: int64

In [35]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Id                   252000 non-null  int64 
 1   income               252000 non-null  int64 
 2   age                  252000 non-null  int64 
 3   experience           252000 non-null  int64 
 4   married              252000 non-null  object
 5   house_ownership      252000 non-null  object
 6   car_ownership        252000 non-null  object
 7   profession           252000 non-null  object
 8   city                 252000 non-null  object
 9   state                252000 non-null  object
 10  current_job_years    252000 non-null  int64 
 11  current_house_years  252000 non-null  int64 
 12  risk_flag            252000 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 25.0+ MB


In [36]:
train.isnull().sum()

Id                     0
income                 0
age                    0
experience             0
married                0
house_ownership        0
car_ownership          0
profession             0
city                   0
state                  0
current_job_years      0
current_house_years    0
risk_flag              0
dtype: int64

In [37]:
train["age_brackets"]=pd.qcut(train["age"],q=[0,0.4,0.8,1],precision=0)
train["income_brackets"]=pd.qcut(train["income"],q=[0,0.4,0.8,1],precision=0)

In [38]:
train.drop(["Id","profession","city","state","age","income"],axis=1,inplace=True)

In [39]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [40]:
married_transform=Pipeline(steps=[
    ('ordinal_encoder',OrdinalEncoder()),
    ])

car_owner_transform=Pipeline(steps=[
    ('ordinal_encoder',OrdinalEncoder()),
    ])

ohe_transform=Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

numeric_features = train.drop("risk_flag",axis=1).select_dtypes(include=['int64', 'float64']).columns

scaler_transform = Pipeline(steps=[
    ('scaler', StandardScaler())
    ])

In [41]:
preprocessor = ColumnTransformer(
    transformers=[
        ('married_oe',married_transform,["married"]),
        ('car_owner_oe',car_owner_transform,["car_ownership"]),
        ('house_owner_ohe',ohe_transform,["car_ownership","age_brackets","income_brackets"]),
        ('scaler',scaler_transform,numeric_features),
    ])

In [42]:
X = train.drop('risk_flag', axis=1)
y = train['risk_flag']

In [50]:
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X,y)
X_over=pd.DataFrame(X_over,columns=X.columns)



In [None]:
def create_Xy(X,y,oversample=False,test_size=0.2):
  if oversample:
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,shuffle=True)
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    X_over=pd.DataFrame(X_over,columns=X.columns)
    return X_over, X_test, y_over,y_test
  else:
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test=create_Xy(X,y,oversample=True,test_size=0.1)
print(X_train.shape,X_test.shape)



(397964, 9) (25200, 9)


In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', classifier)])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    print(classifier)
    print('Roc Auc: %.3f' % score)

    pipe.fit(X_train,y_train)
    y_pred_test=pipe.predict(X_test)
    auc_test = roc_auc_score(y_test, y_pred_test)
    print('ROC AUC test: %f' % auc_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
Roc Auc: 0.947
ROC AUC test: 0.663548
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
Roc Auc: 0.942
ROC AUC test: 0.558993




RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Roc Auc: 0.992
ROC AUC test: 0.610897
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
Roc Auc: 0.664
ROC AUC test: 0.622824
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                    

In [None]:
train.groupby(by=["profession"])["income"].min().sort_values()

profession
Engineer                      10310
Graphic_Designer              10675
Surveyor                      11114
Civil_servant                 11430
Analyst                       11550
Flight_attendant              11695
Chef                          11735
Technical_writer              11799
Architect                     11854
Physician                     12608
Fashion_Designer              12766
Microbiologist                12825
Designer                      12841
Army_officer                  12902
Police_officer                13220
Computer_hardware_engineer    13357
Comedian                      13429
Official                      13629
Scientist                     13792
Air_traffic_controller        14339
Drafter                       14936
Chartered_Accountant          15613
Technician                    16045
Magistrate                    16212
Economist                     16890
Politician                    16909
Technology_specialist         17496
Financial_Analyst

In [None]:
train.corr()["current_job_years"]

Id                    -0.003250
income                 0.007045
age                    0.002154
experience             0.646098
current_job_years      1.000000
current_house_years    0.005372
risk_flag             -0.016942
Name: current_job_years, dtype: float64

In [None]:
train[["income","current_job_years","risk_flag"]].sort_values(by="income")

Unnamed: 0,income,current_job_years,risk_flag
146007,10310,7,0
27151,10310,7,0
81247,10310,7,0
75837,10310,7,0
200026,10310,7,0
...,...,...,...
140572,9999180,0,0
108199,9999180,0,0
172947,9999180,0,0
225535,9999400,10,0


In [None]:
train_risk=train[train["risk_flag"]==1]

In [None]:
train_risk.age_brackets.value_counts()

(20.0, 44.0]    13103
(44.0, 68.0]    12468
(68.0, 79.0]     5425
Name: age_brackets, dtype: int64

In [None]:
train_risk.income_brackets.value_counts()

(10309.0, 3983449.0]      12450
(3983449.0, 7957694.0]    12161
(7957694.0, 9999938.0]     6385
Name: income_brackets, dtype: int64

# test

In [57]:
test=pd.read_csv("/content/drive/MyDrive/univ.ai/univ_data/Test Data.csv")
ID=test["id"]

In [46]:
test["age_brackets"]=pd.qcut(test["age"],q=[0,0.4,0.8,1],precision=0)
test["income_brackets"]=pd.qcut(test["income"],q=[0,0.4,0.8,1],precision=0)
test.drop(["id","profession","city","state","age","income"],axis=1,inplace=True)

In [47]:
display(X.head(),test.head())

Unnamed: 0,experience,married,house_ownership,car_ownership,current_job_years,current_house_years,age_brackets,income_brackets
0,3,single,rented,no,3,13,"(20.0, 44.0]","(10309.0, 3983449.0]"
1,10,single,rented,no,9,13,"(20.0, 44.0]","(3983449.0, 7957694.0]"
2,4,married,rented,no,4,10,"(44.0, 68.0]","(3983449.0, 7957694.0]"
3,2,single,rented,yes,2,12,"(20.0, 44.0]","(3983449.0, 7957694.0]"
4,11,single,rented,no,3,14,"(44.0, 68.0]","(3983449.0, 7957694.0]"


Unnamed: 0,experience,married,house_ownership,car_ownership,current_job_years,current_house_years,age_brackets,income_brackets
0,19,single,rented,no,4,13,"(44.0, 68.0]","(4040293.0, 7961910.0]"
1,5,single,rented,no,5,10,"(20.0, 44.0]","(10309.0, 4040293.0]"
2,12,single,rented,no,9,14,"(44.0, 68.0]","(7961910.0, 9999814.0]"
3,9,married,rented,yes,3,12,"(44.0, 68.0]","(10309.0, 4040293.0]"
4,18,single,rented,yes,13,11,"(20.0, 44.0]","(10309.0, 4040293.0]"


In [51]:
x_train=preprocessor.fit_transform(X_over)

In [52]:
x_test=preprocessor.transform(test)

In [53]:
print(x_train.shape,y_over.shape)

(442008, 13) (442008,)


In [54]:
from sklearn.ensemble import GradientBoostingClassifier
classifier=GradientBoostingClassifier()
classifier.fit(x_train,y_over)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [55]:
predictions=classifier.predict(x_test)

In [58]:
result=pd.DataFrame()
result["id"]=ID
result["risk_flag"]=predictions

In [59]:
result.risk_flag.value_counts()

0    16124
1    11876
Name: risk_flag, dtype: int64

In [62]:
result.to_csv("prediction.csv",index=False)