In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os

In [51]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
train.columns

Index(['Complaint-ID', 'Date-received', 'Transaction-Type', 'Complaint-reason',
       'Company-response', 'Date-sent-to-company', 'Complaint-Status',
       'Consumer-disputes', 'Consumer-complaint-summary'],
      dtype='object')

In [52]:
y = train['Complaint-Status']


In [53]:
df_null = train.isnull().sum().sort_values(ascending=False)
df_null_percent = (train.isnull().sum() / train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([df_null,df_null_percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(9)

Unnamed: 0,Total,Percent
Company-response,22506,0.520178
Consumer-disputes,7698,0.177923
Consumer-complaint-summary,0,0.0
Complaint-Status,0,0.0
Date-sent-to-company,0,0.0
Complaint-reason,0,0.0
Transaction-Type,0,0.0
Date-received,0,0.0
Complaint-ID,0,0.0


In [54]:
df_null = test.isnull().sum().sort_values(ascending=False)
df_null_percent = (test.isnull().sum() / test.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([df_null,df_null_percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(9)

Unnamed: 0,Total,Percent
Company-response,9701,0.523162
Consumer-disputes,3304,0.17818
Consumer-complaint-summary,0,0.0
Date-sent-to-company,0,0.0
Complaint-reason,0,0.0
Transaction-Type,0,0.0
Date-received,0,0.0
Complaint-ID,0,0.0


In [55]:
# train['Consumer-disputes'].head()
# train['Consumer-disputes'] = train['Consumer-disputes'].map(dict(Yes=1, No=0))
# train['Consumer-disputes'].value_counts()

# # train.groupby('Consumer-disputes').nunique()
# train['Consumer-disputes'].fillna(0, inplace=True)

In [56]:
train.groupby('Complaint-Status').nunique()

Unnamed: 0_level_0,Complaint-ID,Date-received,Transaction-Type,Complaint-reason,Company-response,Date-sent-to-company,Complaint-Status,Consumer-disputes,Consumer-complaint-summary
Complaint-Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Closed,809,458,9,46,9,451,1,2,809
Closed with explanation,34300,920,18,149,10,930,1,2,34121
Closed with monetary relief,2818,820,17,116,9,804,1,2,2818
Closed with non-monetary relief,5018,889,17,118,10,883,1,2,4999
Untimely response,321,257,12,48,0,256,1,1,321


In [57]:
# train['Company-response'].isnull().sum()
train.drop('Company-response', axis=1, inplace=True)
train.drop('Complaint-Status', axis=1, inplace=True)
test.drop('Company-response', axis=1, inplace=True)

# train.columns

In [58]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


X = pd.DataFrame(train[['Consumer-disputes']])
te_x = pd.DataFrame(train[['Consumer-disputes']])
train[['Consumer-disputes']] = DataFrameImputer().fit_transform(X)
test[['Consumer-disputes']] = DataFrameImputer().fit_transform(te_x)

In [59]:
train['Date-received'] = pd.to_datetime(train['Date-received'])
train['Date-sent-to-company'] = pd.to_datetime(train['Date-sent-to-company'])
train.dtypes

Complaint-ID                          object
Date-received                 datetime64[ns]
Transaction-Type                      object
Complaint-reason                      object
Date-sent-to-company          datetime64[ns]
Consumer-disputes                     object
Consumer-complaint-summary            object
dtype: object

In [60]:
test['Date-received'] = pd.to_datetime(test['Date-received'])
test['Date-sent-to-company'] = pd.to_datetime(test['Date-sent-to-company'])
test.dtypes

Complaint-ID                          object
Date-received                 datetime64[ns]
Transaction-Type                      object
Complaint-reason                      object
Date-sent-to-company          datetime64[ns]
Consumer-disputes                     object
Consumer-complaint-summary            object
dtype: object

In [62]:
train['days_taken'] = (train['Date-sent-to-company'] - train['Date-received']).dt.days
test['days_taken'] = (test['Date-sent-to-company'] - test['Date-received']).dt.days
test.head()

Unnamed: 0,Complaint-ID,Date-received,Transaction-Type,Complaint-reason,Date-sent-to-company,Consumer-disputes,Consumer-complaint-summary,days_taken
0,Te-1,2016-08-18,Bank account or service,"Account opening, closing, or management",2016-08-18,Yes,XXXX / XXXX / 16 I called Citibank to open a c...,0
1,Te-2,2016-04-18,Debt collection,Communication tactics,2016-04-20,No,I'm struggling financially. I called and I off...,2
2,Te-3,2016-03-23,Credit reporting,Incorrect information on credit report,2016-03-23,No,"In XXXX of 2015, an automatic payment was conf...",0
3,Te-4,2017-06-26,Student loan,Dealing with your lender or servicer,2017-06-26,No,"I submitted a request to XXXX, which is my cur...",0
4,Te-5,2016-05-13,Credit reporting,Incorrect information on credit report,2016-05-13,No,A state tax lien was filed against me XXXX / X...,0


In [63]:
train.drop('Date-sent-to-company', axis=1, inplace=True)
test.drop('Date-sent-to-company', axis=1, inplace=True)
train.drop('Complaint-ID', axis=1, inplace=True)
test.drop('Complaint-ID', axis=1, inplace=True)


In [64]:
from collections import defaultdict
d = defaultdict(LabelEncoder)

fit = train.apply(lambda x: d[x.name].fit_transform(x))
fit_test = test.apply(lambda x: d[x.name].fit_transform(x))
fit.head()

Unnamed: 0,Date-received,Transaction-Type,Complaint-reason,Consumer-disputes,Consumer-complaint-summary,days_taken
0,237,10,78,1,32712,0
1,110,5,71,0,39918,0
2,49,0,145,0,5429,0
3,604,7,36,0,35360,0
4,560,3,100,0,22874,0


In [66]:
# fit.drop('Complaint-ID', axis=1, inplace=True)
# fit_test.drop('Complaint-ID', axis=1, inplace=True)
fit.columns

Index(['Date-received', 'Transaction-Type', 'Complaint-reason',
       'Consumer-disputes', 'Consumer-complaint-summary', 'days_taken'],
      dtype='object')

In [67]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
fit_std = stdsc.fit_transform(fit)
fit_std = pd.DataFrame(fit_std)
fit_test_std = stdsc.fit_transform(fit_test)
fit_test_std = pd.DataFrame(fit_test_std)
fit_std.head()

Unnamed: 0,0,1,2,3,4,5
0,-0.95529,0.893237,0.378599,2.115005,0.902535,-0.257772
1,-1.446752,-0.422784,0.179011,-0.472812,1.482746,-0.257772
2,-1.682809,-1.738804,2.288944,-0.472812,-1.294234,-0.257772
3,0.464918,0.103625,-0.818931,-0.472812,1.115746,-0.257772
4,0.294648,-0.949192,1.005877,-0.472812,0.1104,-0.257772


In [68]:
fit_std.columns = [ 'Date-received', 'Transaction-Type', 'Complaint-reason','Date-sent-to-company','Consumer-disputes', 'Consumer-complaint-summary']
fit_test_std.columns = [ 'Date-received', 'Transaction-Type', 'Complaint-reason','Date-sent-to-company','Consumer-disputes', 'Consumer-complaint-summary']

In [69]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(fit,y, test_size=0.3, random_state=1 )

In [70]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
feat_label = fit.columns[:]
forest = RandomForestClassifier(n_estimators=1400, max_depth=12, min_samples_leaf=4, max_features=0.5, n_jobs=-1, random_state=0)
forest.fit(X_train,y_train) #in the random forest we dont need to standardize the data
importances = forest.feature_importances_
indeces = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(feat_label[indeces[f]], importances[indeces[f]])



numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



Date-received 0.257606580559895
Consumer-complaint-summary 0.2491612287142967
Transaction-Type 0.2286798633144994
Complaint-reason 0.1512995701312392
days_taken 0.07287341540350238
Consumer-disputes 0.04037934187656685


In [71]:
y_pred = forest.predict(X_test)

In [72]:
random_cm = confusion_matrix(y_test,y_pred)
random_cm


array([[    1,   248,     1,     0,     0],
       [    0, 10182,    43,    25,     0],
       [    0,   814,    56,     2,     0],
       [    0,  1475,    16,    30,     0],
       [    0,    87,     0,     0,     0]])

In [73]:
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
print(accuracy_score(y_test,y_pred))

0.7911402157164868


In [74]:
print(len(y_pred))

12980


In [75]:
y_test = forest.predict(fit_test)

In [76]:
predicted = pd.DataFrame(y_test)
predicted.head()

Unnamed: 0,0
0,Closed with explanation
1,Closed with explanation
2,Closed with explanation
3,Closed with explanation
4,Closed with explanation


In [77]:
predicted.size

18543

In [78]:
predi = predicted.reset_index()
predi.columns = ['Complaint-ID', 'Complaint-Status' ]
predi['Complaint-ID'] = predi.index +1
predi['Complaint-ID'] = 'Te-' + predi['Complaint-ID'].astype(str)
predi.head()

Unnamed: 0,Complaint-ID,Complaint-Status
0,Te-1,Closed with explanation
1,Te-2,Closed with explanation
2,Te-3,Closed with explanation
3,Te-4,Closed with explanation
4,Te-5,Closed with explanation


In [79]:
predi.to_csv('prediction.csv')