In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os

In [46]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
train.columns

Index(['Complaint-ID', 'Date-received', 'Transaction-Type', 'Complaint-reason',
       'Company-response', 'Date-sent-to-company', 'Complaint-Status',
       'Consumer-disputes', 'Consumer-complaint-summary'],
      dtype='object')

In [47]:
y = train['Complaint-Status']


In [48]:
df_null = train.isnull().sum().sort_values(ascending=False)
df_null_percent = (train.isnull().sum() / train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([df_null,df_null_percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(9)

Unnamed: 0,Total,Percent
Company-response,22506,0.520178
Consumer-disputes,7698,0.177923
Consumer-complaint-summary,0,0.0
Complaint-Status,0,0.0
Date-sent-to-company,0,0.0
Complaint-reason,0,0.0
Transaction-Type,0,0.0
Date-received,0,0.0
Complaint-ID,0,0.0


In [49]:
# class_le = LabelEncoder()
# z = class_le.fit_transform(train['Complaint-Status'])
# z

In [50]:
train.groupby('Complaint-Status').nunique()

Unnamed: 0_level_0,Complaint-ID,Date-received,Transaction-Type,Complaint-reason,Company-response,Date-sent-to-company,Complaint-Status,Consumer-disputes,Consumer-complaint-summary
Complaint-Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Closed,809,458,9,46,9,451,1,2,809
Closed with explanation,34300,920,18,149,10,930,1,2,34121
Closed with monetary relief,2818,820,17,116,9,804,1,2,2818
Closed with non-monetary relief,5018,889,17,118,10,883,1,2,4999
Untimely response,321,257,12,48,0,256,1,1,321


In [51]:
# train['Company-response'].isnull().sum()
train.drop('Company-response', axis=1, inplace=True)
train.drop('Complaint-Status', axis=1, inplace=True)
test.drop('Company-response', axis=1, inplace=True)

# train.columns

In [52]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


X = pd.DataFrame(train[['Consumer-disputes']])
te_x = pd.DataFrame(train[['Consumer-disputes']])
train[['Consumer-disputes']] = DataFrameImputer().fit_transform(X)
test[['Consumer-disputes']] = DataFrameImputer().fit_transform(te_x)

In [53]:
from collections import defaultdict
d = defaultdict(LabelEncoder)

fit = train.apply(lambda x: d[x.name].fit_transform(x))
fit_test = test.apply(lambda x: d[x.name].fit_transform(x))
fit.head()

Unnamed: 0,Complaint-ID,Date-received,Transaction-Type,Complaint-reason,Date-sent-to-company,Consumer-disputes,Consumer-complaint-summary
0,0,128,10,78,135,1,32712
1,11111,735,5,71,742,0,39918
2,22222,552,0,145,559,0,5429
3,33333,131,7,36,138,0,35360
4,37711,895,3,100,905,0,22874


In [54]:
fit.drop('Complaint-ID', axis=1, inplace=True)
fit_test.drop('Complaint-ID', axis=1, inplace=True)
fit.columns

Index(['Date-received', 'Transaction-Type', 'Complaint-reason',
       'Date-sent-to-company', 'Consumer-disputes',
       'Consumer-complaint-summary'],
      dtype='object')

In [55]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
fit_std = stdsc.fit_transform(fit)
fit_std = pd.DataFrame(fit_std)
fit_test_std = stdsc.fit_transform(fit_test)
fit_test_std = pd.DataFrame(fit_test_std)
fit_std.head()

Unnamed: 0,0,1,2,3,4,5
0,-1.241213,0.893237,0.378599,-1.238773,2.115005,0.902535
1,1.087503,-0.422784,0.179011,1.082575,-0.472812,1.482746
2,0.385435,-1.738804,2.288944,0.382729,-0.472812,-1.294234
3,-1.229704,0.103625,-0.818931,-1.2273,-0.472812,1.115746
4,1.701332,-0.949192,1.005877,1.705935,-0.472812,0.1104


In [56]:
fit_std.columns = [ 'Date-received', 'Transaction-Type', 'Complaint-reason','Date-sent-to-company','Consumer-disputes', 'Consumer-complaint-summary']
fit_test_std.columns = [ 'Date-received', 'Transaction-Type', 'Complaint-reason','Date-sent-to-company','Consumer-disputes', 'Consumer-complaint-summary']

In [57]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(fit,y, test_size=0.3, random_state=1 )

In [58]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
feat_label = fit.columns[:]
forest = RandomForestClassifier(n_estimators=1400, max_depth=12, min_samples_leaf=4, max_features=0.5, n_jobs=-1, random_state=0)
forest.fit(X_train,y_train) #in the random forest we dont need to standardize the data
importances = forest.feature_importances_
indeces = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(feat_label[indeces[f]], importances[indeces[f]])


Transaction-Type 0.23584660700982804
Consumer-complaint-summary 0.22291872706071725
Date-received 0.17664938380724904
Date-sent-to-company 0.1739037448087869
Complaint-reason 0.15113247236748822
Consumer-disputes 0.039549064945930856


In [59]:
y_pred = forest.predict(X_test)

In [60]:
random_cm = confusion_matrix(y_test,y_pred)
random_cm


array([[    0,   250,     0,     0,     0],
       [    0, 10183,    44,    23,     0],
       [    0,   816,    55,     1,     0],
       [    0,  1481,    16,    24,     0],
       [    0,    87,     0,     0,     0]])

In [61]:
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
print(accuracy_score(y_test,y_pred))

0.7906009244992296


In [41]:
print(len(y_pred))

12980


In [62]:
y_test = forest.predict(fit_test)

In [63]:
predicted = pd.DataFrame(y_test)
predicted.head()

Unnamed: 0,0
0,Closed with explanation
1,Closed with explanation
2,Closed with explanation
3,Closed with explanation
4,Closed with explanation


In [65]:
predicted.size

18543

In [71]:
predi = predicted.reset_index()
predi.columns = ['Complaint-ID', 'Complaint-Status' ]
predi['Complaint-ID'] = predi.index +1
predi['Complaint-ID'] = 'Te-' + predi['Complaint-ID'].astype(str)
predi.head()

Unnamed: 0,Complaint-ID,Complaint-Status
0,Te-1,Closed with explanation
1,Te-2,Closed with explanation
2,Te-3,Closed with explanation
3,Te-4,Closed with explanation
4,Te-5,Closed with explanation


In [73]:
predi.to_csv('prediction.csv')