In [22]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

#df.head()

In [24]:
train_df.shape

(43266, 9)

In [25]:
train_df.columns

Index(['Complaint-ID', 'Date-received', 'Transaction-Type', 'Complaint-reason',
       'Company-response', 'Date-sent-to-company', 'Complaint-Status',
       'Consumer-disputes', 'Consumer-complaint-summary'],
      dtype='object')

In [26]:
#Data preprocessing
train_data = train_df.drop(['Consumer-complaint-summary','Complaint-ID','Date-received','Date-sent-to-company','Complaint-Status'],axis=1)
test_data = test_df.drop(['Consumer-complaint-summary','Complaint-ID','Date-received','Date-sent-to-company'],axis=1)


In [27]:
#Data preprocessing
#retaining only alphabets and removing everything else.

text_headers = ['Transaction-Type', 'Complaint-reason','Company-response']

for header in text_headers:
    train_data[header] = train_data[header].str.replace("[^a-zA-Z]", " ")
    test_data[header] = test_data[header].str.replace("[^a-zA-Z]", " ")
    
train_data.head()

Unnamed: 0,Transaction-Type,Complaint-reason,Company-response,Consumer-disputes
0,Mortgage,Loan servicing payments escrow account,,Yes
1,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,No
2,Bank account or service,Using a debit or ATM card,,No
3,Debt collection,Cont d attempts collect debt not owed,Company believes it acted appropriately as aut...,No
4,Credit card,Payoff process,Company has responded to the consumer and the ...,No


In [28]:
print(train_data.isnull().sum())
print(test_data.isnull().sum())
#print(label.isnull().sum())

Transaction-Type         0
Complaint-reason         0
Company-response     22506
Consumer-disputes     7698
dtype: int64
Transaction-Type        0
Complaint-reason        0
Company-response     9701
Consumer-disputes    3304
dtype: int64


In [29]:
#Filling NaN values
train_data['Company-response'].fillna('Not provided',inplace=True)
train_data['Consumer-disputes'].fillna(train_data['Consumer-disputes'].mode()[0],inplace=True)
#------------------------------
test_data['Company-response'].fillna('Not provided',inplace=True)
test_data['Consumer-disputes'].fillna(test_data['Consumer-disputes'].mode()[0],inplace=True)


In [30]:
#data['Transaction-Type'].unique()
print(train_data.isnull().sum())
print(test_data.isnull().sum())

Transaction-Type     0
Complaint-reason     0
Company-response     0
Consumer-disputes    0
dtype: int64
Transaction-Type     0
Complaint-reason     0
Company-response     0
Consumer-disputes    0
dtype: int64


In [31]:
#One hot encoding


train = pd.get_dummies(train_data)
test = pd.get_dummies(test_data)



In [32]:
print(train.shape)
print(test.shape)

(43266, 181)
(18543, 178)


In [33]:
label = train_df['Complaint-Status']
label.unique()
#print(train_df['Complaint-Status'].isnull().sum())

array(['Closed with explanation', 'Closed with non-monetary relief',
       'Closed', 'Closed with monetary relief', 'Untimely response'],
      dtype=object)

In [34]:
label.value_counts()

Closed with explanation            34300
Closed with non-monetary relief     5018
Closed with monetary relief         2818
Closed                               809
Untimely response                    321
Name: Complaint-Status, dtype: int64

In [35]:
#Label Encoding
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(label)
label_converted = label_encoder.transform(label)
label_converted

array([1, 3, 1, ..., 1, 1, 1])

In [36]:
#list(set(train.columns)-set(test.columns))
output = list(set(train.columns) - set(test.columns))
#list(set(train.columns) - set(test.columns))

for column in output:
    test = test.set_index(test.index)
    test[column]=0
    
print(output)
    
    





['Complaint-reason_Account terms and changes', 'Complaint-reason_Was approved for a loan  but didn t receive the money', 'Complaint-reason_Problem with an overdraft', 'Complaint-reason_Incorrect exchange rate', 'Complaint-reason_Advertising']


In [37]:
print(train.shape)
print(test.shape)
test.columns

(43266, 181)
(18543, 183)


Index(['Transaction-Type_Bank account or service',
       'Transaction-Type_Checking or savings account',
       'Transaction-Type_Consumer Loan', 'Transaction-Type_Credit card',
       'Transaction-Type_Credit card or prepaid card',
       'Transaction-Type_Credit reporting',
       'Transaction-Type_Credit reporting  credit repair services  or other personal consumer reports',
       'Transaction-Type_Debt collection',
       'Transaction-Type_Money transfer  virtual currency  or money service',
       'Transaction-Type_Money transfers',
       ...
       'Company-response_Company disputes the facts presented in the complaint',
       'Company-response_Company has responded to the consumer and the CFPB and chooses not to provide a public response',
       'Company-response_Not provided', 'Consumer-disputes_No',
       'Consumer-disputes_Yes', 'Complaint-reason_Account terms and changes',
       'Complaint-reason_Was approved for a loan  but didn t receive the money',
       'Complain

In [38]:
op = list(set(test.columns)-set(train.columns))
print(op)
for col in op:
    train = train.set_index(train.index)
    train[col]=0
#op = ['Complaint-reason_Problem with cash advance', 'Complaint-reason_Can t stop withdrawals from your bank account']  


['Complaint-reason_Can t stop withdrawals from your bank account', 'Complaint-reason_Problem with cash advance']


In [39]:
train['Complaint-reason_Can t stop withdrawals from your bank account']

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
43236    0
43237    0
43238    0
43239    0
43240    0
43241    0
43242    0
43243    0
43244    0
43245    0
43246    0
43247    0
43248    0
43249    0
43250    0
43251    0
43252    0
43253    0
43254    0
43255    0
43256    0
43257    0
43258    0
43259    0
43260    0
43261    0
43262    0
43263    0
43264    0
43265    0
Name: Complaint-reason_Can t stop withdrawals from your bank account, Length: 43266, dtype: int64

In [40]:
print(train.shape)
print(test.shape)

(43266, 183)
(18543, 183)


In [41]:
print(train.isnull())
#print(test.isnull().sum())

       Transaction-Type_Bank account or service  \
0                                         False   
1                                         False   
2                                         False   
3                                         False   
4                                         False   
5                                         False   
6                                         False   
7                                         False   
8                                         False   
9                                         False   
10                                        False   
11                                        False   
12                                        False   
13                                        False   
14                                        False   
15                                        False   
16                                        False   
17                                        False   
18                             

In [None]:
train.head(30)

In [None]:
'''from sklearn.model_selection import GridSearchCV

clf = LogisticRegression()
paramgrid = {'C':list(range(1,20,2)), 'penalty':['l1', 'l2']}
gridsearch = GridSearchCV(clf, param_grid)
gridsearch.fit(train, label_converted);'''

In [None]:
#gridsearch.best_params_
# output - C= 1, penalty='l1'

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


score = 0.0
#Stratified k-fold technique
i=1
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)

for train_index, test_index in kf.split(train,label_converted):
    print('\n{} of kfold {}'.format(i,kf.n_splits))
    xtr,xval = train.loc[train_index],train.loc[test_index]
    ytr,yval = label_converted[train_index],label_converted[test_index]
          
    model = LogisticRegression(random_state=1,C= 1, penalty='l1')
    model.fit(xtr,ytr)
    pred_test = model.predict(xval)
    score += f1_score(pred_test,yval,average='weighted')
    print('fscore: ',f1_score(pred_test,yval,average='weighted'))
    i+=1
    
print('final val score: ',score/5)

#0.718576163223847 solver=sag,saga,newton-cg




#0.7229720579457285
#score-0.71659


1 of kfold 5


  'recall', 'true', average, warn_for)


fscore:  0.8730310452085437

2 of kfold 5


  'recall', 'true', average, warn_for)


fscore:  0.872977510741755

3 of kfold 5


  'recall', 'true', average, warn_for)


fscore:  0.8726653625966113

4 of kfold 5


  'recall', 'true', average, warn_for)


fscore:  0.8798917573293629

5 of kfold 5




fscore:  0.8759593936265406
final val score:  0.8749050139005627


  'recall', 'true', average, warn_for)


In [43]:
label_pred = model.predict(test)

In [44]:
submission = pd.DataFrame(data=test_df['Complaint-ID'])
submission['Complaint-ID'] = test_df['Complaint-ID']

text_label = label_encoder.inverse_transform(label_pred)

submission['Complaint-Status'] = text_label
submission.to_csv('submission6.csv',index=False)