In [16]:
import pandas as pd


In [17]:
df = pd.read_csv('corpus.csv')

In [18]:
df.dropna(inplace=True)

In [19]:
corpus = list(df['messages'])

In [20]:
from sklearn.feature_extraction.text import CountVectorizer


In [21]:
cv = CountVectorizer(max_features = 1000)
X = cv.fit_transform(corpus).toarray()

In [22]:
X.shape

(51473, 1000)

In [23]:
y = df['target'].values

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2,random_state=42)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score


In [88]:
def classifier(clf,X_train, X_test, y_train, y_test):
  clf.fit(X_train,y_train)
  print("Train Results \n")
  y_train_pred  = clf.predict(X_train)
  y_train_prob = clf.predict_proba(X_train)[:,1]

  print("Confusion Matrix for Train : \n", confusion_matrix(y_train, y_train_pred))
  print("Accuracy Score for Train : ", accuracy_score(y_train, y_train_pred))
  print("ROC AUC for Train : ", roc_auc_score(y_train, y_train_prob))

  print("Test Results \n")
  y_test_pred  = clf.predict(X_test)
  y_test_prob = clf.predict_proba(X_test)[:,1]


  print("Confusion Matrix for Test : \n", confusion_matrix(y_test, y_test_pred))
  print("Accuracy Score for Test : ", accuracy_score(y_test, y_test_pred))
  print("ROC AUC for Test : ", roc_auc_score(y_test, y_test_prob))



## Random Forest Classifier

In [26]:
rfc = RandomForestClassifier(max_depth=15,max_features=50,n_estimators=400,random_state=4)

In [89]:
classifier(rfc,X_train, X_test, y_train, y_test)

Train Results 

Confusion Matrix for Train : 
 [[ 8253  4966]
 [ 2253 25706]]
Accuracy Score for Train :  0.8246879401622226
ROC AUC for Train :  0.9256125343275976
Test Results 

Confusion Matrix for Test : 
 [[1866 1460]
 [ 726 6243]]
Accuracy Score for Test :  0.7876639145216124
ROC AUC for Test :  0.9008317005979664


## Light GBM

normal lgb with params

In [90]:
import lightgbm as lgb

In [91]:
d_train=lgb.Dataset(X_train, label=y_train)

In [114]:
params={}
params['learning_rate']=0.01
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='binary' #Binary target feature
params['metric']='binary_logloss' #metric for binary classification
params['max_depth']=20
#train the model 
clf=lgb.train(params,d_train,500) #train the model on 100 epocs
#prediction on the test set
y_pred_train = clf.predict(X_train)
y_pred=clf.predict(X_test)

In [115]:
y_pred_round = []
for i in y_pred:
  if i<0.5:
    y_pred_round.append(0)
  else:
    y_pred_round.append(1)
y_train_round = []
for i in y_pred_train:
  if i<0.5:
    y_train_round.append(0)
  else:
    y_train_round.append(1)

In [118]:
print("Training Acc: ",accuracy_score(y_train_round, y_train))
print("Testing Acc: ",accuracy_score(y_pred_round, y_test))

Training Acc:  0.8274321239496819
Testing Acc:  0.8142787761049053


lgb classifier


boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [121]:
clf = lgb.LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, importance_type='split', learning_rate=0.01, max_depth=20, min_child_samples=20, min_child_weight=0.001, n_estimators=600, n_jobs=-1, num_leaves=31, objective='binary', random_state=42, reg_alpha=0.8, reg_lambda=0.6, silent=True)
classifier(clf,X_train, X_test, y_train, y_test)

Train Results 

Confusion Matrix for Train : 
 [[10027  3192]
 [ 3870 24089]]
Accuracy Score for Train :  0.8285006556899315
ROC AUC for Train :  0.9267941936668251
Test Results 

Confusion Matrix for Test : 
 [[2419  907]
 [1018 5951]]
Accuracy Score for Test :  0.8130160271976687
ROC AUC for Test :  0.9156018401913396


# TEST

In [46]:
test = pd.read_csv('test.csv')

In [47]:
id = test['Id']

In [48]:
test = test[['Subject','Body']]

In [49]:
test['messages'] = test['Subject']+' '+test['Body']

In [53]:
import re
import nltk
import ssl
##ssl error aaya toh
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [54]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [58]:
ps = PorterStemmer()
test_corpus = []

In [61]:
for i in range(len(test)):
    review = re.sub('[^a-zA-Z]',' ',str(test['messages'][i]))
    review = review.lower().split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    test_corpus.append(review)

In [69]:
test_data = cv.fit_transform(test_corpus).toarray()

In [71]:
test_data.shape

(4898, 1000)

## Different classifiers for test data

In [74]:
test_prob = rfc.predict_proba(test_data)[:,1]

In [126]:
test_prob = clf.predict_proba(test_data)[:,1]

In [127]:
test_prob

array([0.76865012, 0.93623723, 0.69842507, ..., 0.90759659, 0.61417583,
       0.4478599 ])

## Creation of submission,csv from here

In [131]:
submission = pd.DataFrame()

In [132]:
submission['Id'] = id

In [133]:
submission['Flag'] = test_prob

In [134]:
submission = submission.set_index('Id')

In [136]:
submission.to_csv('submission.csv')