In [0]:
! pip install -q kaggle
from google.colab import files

files.upload()

In [0]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [3]:
! kaggle competitions download -c 'jigsaw-toxic-comment-classification-challenge'

Downloading train.csv.zip to /content
 19% 5.00M/26.3M [00:00<00:00, 37.4MB/s]
100% 26.3M/26.3M [00:00<00:00, 104MB/s] 
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 145MB/s]
Downloading test.csv.zip to /content
 90% 21.0M/23.4M [00:00<00:00, 93.4MB/s]
100% 23.4M/23.4M [00:00<00:00, 92.5MB/s]
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 199MB/s]


In [4]:
! mkdir /content/dataset
! unzip /content/train.csv.zip -d /content/dataset
! unzip /content/test.csv.zip -d /content/dataset
! unzip /content/test_labels.csv.zip -d /content/dataset


Archive:  /content/train.csv.zip
  inflating: /content/dataset/train.csv  
Archive:  /content/test.csv.zip
  inflating: /content/dataset/test.csv  
Archive:  /content/test_labels.csv.zip
  inflating: /content/dataset/test_labels.csv  


In [0]:
import pandas as pd

In [6]:
train = pd.read_csv('/content/dataset/train.csv')
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:

val = train['toxic'] + train['severe_toxic'] + train['obscene'] + train['threat'] + train['identity_hate']
new_list = []
for x in val:
  if x == 0:
    new_list.append(0)
  else:
    new_list.append(1)
new_df = pd.DataFrame(list(zip(train['id'], train['comment_text'], new_list)), columns= ['id', 'comment_text', 'toxicity'])
new_df

Unnamed: 0,id,comment_text,toxicity
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0
...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0


TF-IDF

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer 
docs = new_df['comment_text']
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

**Logistic Regression**

In [0]:
# from scipy import sparse
X = tfidf_vectorizer_vectors
Y = new_df['toxicity']

In [0]:
from sklearn.model_selection import train_test_split
# Split data set into training and test sets
X_train_old, X_test_old, y_train_old, y_test_old = train_test_split(X, Y, test_size=0.2, random_state=101)

In [32]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train_old, y_train_old)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred_old = logmodel.predict(X_test_old)

In [34]:
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_test_old, y_pred_old) 
  
print ("Confusion Matrix : \n", cm) 

Confusion Matrix : 
 [[28561   141]
 [ 1276  1937]]


In [35]:
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test_old, y_pred_old))

Accuracy :  0.9556008146639511


In [36]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test_old, y_pred_old)

0.04439918533604888

**Logistic Regression Using SMOTE**

In [38]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=101)
X_res, y_res = sm.fit_resample(X, Y)



In [39]:
print('Before using smote x: {} y: {}'.format(X.shape,len(Y)))

Before using smote x: (159571, 189775) y: 159571


In [40]:
print('After using smote x: {} y: {}'.format(X_res.shape,len(y_res)))


After using smote x: (287294, 189775) y: 287294


In [0]:
from sklearn.model_selection import train_test_split
# Split data set into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=101)

In [42]:
from sklearn.linear_model import LogisticRegression
logmodel_new = LogisticRegression(max_iter=1000)
logmodel_new.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred = logmodel_new.predict(X_test_old)

In [44]:
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_test_old, y_pred) 
  
print ("Confusion Matrix : \n", cm) 

Confusion Matrix : 
 [[26707  1995]
 [  315  2898]]


In [45]:
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test_old, y_pred))

Accuracy :  0.9276202412658624


In [48]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test_old, y_pred)

0.07237975873413756

**Logistic Regression Using Stratified Sampling**

In [0]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=2,shuffle=True,random_state=101)


In [0]:
for train_index, test_index in kf.split(X, Y):
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train_stf, X_test_stf = X[train_index], X[test_index]
  y_train_stf, y_test_stf = Y[train_index], Y[test_index]

TRAIN: [     0      5      6 ... 159564 159567 159570] TEST: [     1      2      3 ... 159566 159568 159569]
TRAIN: [     1      2      3 ... 159566 159568 159569] TEST: [     0      5      6 ... 159564 159567 159570]


In [0]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train_stf, y_train_stf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred_stf = logmodel.predict(X_test_stf)

In [0]:
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_test_stf, y_pred_stf) 
  
print ("Confusion Matrix : \n", cm) 

Confusion Matrix : 
 [[71487   336]
 [ 3471  4491]]


In [0]:
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test_stf, y_pred_stf))

Accuracy :  0.9522842639593908


**LR using Smote & Stratified Sampling**

In [0]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=2,shuffle=True,random_state=101)


In [0]:
for train_index, test_index in kf.split(X_res, y_res):
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train_stf, X_test_stf = X_res[train_index], X_res[test_index]
  y_train_stf, y_test_stf = y_res[train_index], y_res[test_index]

TRAIN: [     0      5      7 ... 287289 287290 287293] TEST: [     1      2      3 ... 287288 287291 287292]
TRAIN: [     1      2      3 ... 287288 287291 287292] TEST: [     0      5      7 ... 287289 287290 287293]


In [0]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train_stf, y_train_stf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred_stf = logmodel.predict(X_test_stf)

In [0]:
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_test_stf, y_pred_stf) 
  
print ("Confusion Matrix : \n", cm) 

Confusion Matrix : 
 [[64865  6958]
 [ 3563 68261]]


In [0]:
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test_stf, y_pred_stf))

Accuracy :  0.926757955265338
