In [2]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
train = pd.read_csv('train_E6oV3lV.csv')
test = pd.read_csv('test_tweets_anuFYb8.csv')
train.tail()

Unnamed: 0,id,label,tweet
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."
31961,31962,0,thank you @user for you follow


In [4]:

print("Shape of Training set:", train.shape)
print("Shape of Testing set:", test.shape)

Shape of Training set: (31962, 3)
Shape of Testing set: (17197, 2)


In [5]:
train.drop_duplicates(subset=['tweet'], keep='last', inplace=True)
train.reset_index(inplace=True)
print("Shape of Train unique:", train.shape)

Shape of Train unique: (29530, 4)


In [6]:
train[train['tweet'].map(lambda x: x.isascii())]
test[test['tweet'].map(lambda x: x.isascii())]
#Dataclean
def clean_tweets(text):
    text = re.sub(r'@[A-Za-z0-9_]+','',text) 
    text = re.sub(r'#','',text)                  
    text = re.sub(r'RT[\s]+',' ',text)           
    text = re.sub(r'\n','',text) 
    text = re.sub(r',','',text) 
    text = re.sub(r'.[.]+','',text) 
    text = re.sub(r'\w+:\/\/\S+','',text) 
    text = re.sub(r'https?:\/\/\S+','',text)  
    text = re.sub(r'/',' ',text)
    text = re.sub(r'-',' ',text)
    text = re.sub(r'_',' ',text)
    text = re.sub(r'!','',text)
    text = re.sub(r':',' ',text)
    text = re.sub(r'$','',text)
    text = re.sub(r'%','',text)
    text = re.sub(r'^','',text)
    text = re.sub(r'&','',text)
    text = re.sub(r'=',' ',text)
    text = re.sub(r' +',' ',text) 
    return text

def clean_emojis(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

train['tweet'] = train['tweet'].apply(clean_tweets)  
train['tweet'] = train['tweet'].apply(clean_emojis)  
train['tweet'] = train.tweet.str.lower()  
train['tweet'] = train['tweet'].str.strip()  
test['tweet'] = test['tweet'].apply(clean_tweets)  
test['tweet'] = test['tweet'].apply(clean_emojis) 
test['tweet'] = test.tweet.str.lower()   
test['tweet'] = test['tweet'].str.strip() 

In [7]:
train.head()

Unnamed: 0,index,id,label,tweet
0,0,1,0,when a father is dysfunctional and is so selfi...
1,1,2,0,thanks for lyft credit i can't use cause they ...
2,2,3,0,bihday your majesty
3,4,5,0,factsguide society now motivation
4,5,6,0,[2 2] huge fan fare and big talking before the...


In [8]:
test.head()

Unnamed: 0,id,tweet
0,31963,studiolife aislife requires passion dedication...
1,31964,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...
4,31967,3rd bihday to my amazing hilarious nephew eli ...


In [9]:
df = train.copy()
df.drop(['index'],axis=1, inplace=True)
pd.set_option('display.max_colwidth', None) 
df

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so selfish he drags his kids into his dysfunctio run
1,2,0,thanks for lyft credit i can't use cause they don't offer wheelchair vans in pd disapointed getthanked
2,3,0,bihday your majesty
3,5,0,factsguide society now motivation
4,6,0,[2 2] huge fan fare and big talking before they leav chaos and pay disputes when they get ther allshowandnogo
...,...,...,...
29525,31958,0,ate isz that youuu?
29526,31959,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisol shame imwithher
29527,31960,0,listening to sad songs on a monday morning otw to work is sad
29528,31961,1,sikh temple vandalised in in calgary wso condemns act


1---> Bad/sexist/racist tweet
0---> Regular tweet

In [10]:
print("Dataset shape: ", df.shape)
df['label'].value_counts()

Dataset shape:  (29530, 3)


0    27517
1     2013
Name: label, dtype: int64

#### Cleaning Punctuations:

In [11]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
df['tweet'] = df['tweet'].astype(str)
punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['tweet'] = df['tweet'].apply(lambda x: cleaning_punctuations(x))

#### Cleaning Stopwords:

In [13]:
sw = stopwords.words('english')
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

#### Cleaning Numeric numbers:

In [14]:
def cleaning_numbers(text):
    return re.sub('[0-9]+', '', text)
df['tweet'] = df['tweet'].apply(lambda text: cleaning_numbers(text))

#### Tokenizing Tweets:

In [15]:
# Stemming requires tokens, hence convertting tweets into Tokens
tokens = (word_tokenize(i) for i in df.tweet)
df['tweet'] = df['tweet'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,id,label,tweet
0,1,0,"[father, dysfunctional, selfish, drags, kids, dysfunctio, run]"
1,2,0,"[thanks, lyft, credit, cant, use, cause, dont, offer, wheelchair, vans, pd, disapointed, getthanked]"
2,3,0,"[bihday, majesty]"
3,5,0,"[factsguide, society, motivation]"
4,6,0,"[huge, fan, fare, big, talking, leav, chaos, pay, disputes, get, ther, allshowandnogo]"


### Stemming:

In [16]:
stemm = SnowballStemmer('english')
df['tweet'] = df['tweet'].apply(lambda x: [stemm.stem(y) for y in x])

#### Splitting data into Train and Test sets

In [17]:
X = df['tweet'].astype(str)  # Converting to string, because vectorizer does'nt accept list.
y = df['label'].astype(str)  # Converting to string, because vectorizer does'nt accept list.
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3) # random_state=3 because 3 is my favourite number :D

#### Transforming to TF-IDF Vectorizer

#### Fitting the Count Vectorizer

In [18]:
vectoriser = CountVectorizer(ngram_range=(1,2))
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))  

No. of feature_words:  154719


#### Fitting the TF-IDF Vectorizer

In [19]:
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

No. of feature_words:  154719


#### Transforming the data using TF-IDF Vectorizer

In [20]:
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

In [21]:
#Using 4 models
models = {
    
    'SVC' :{
        'model' : SVC(),
        'parameters' : {
            'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf','linear','poly','sigmoid']
        }
    },
    
    'logistics_regression' :{
        'model' : LogisticRegression(solver = 'lbfgs', multi_class = 'auto'),
        'parameters' : {
            'C' : [0.1, 1, 10, 50, 60, 90, 100], 'solver' : ['lbfgs', 'liblinear']
        }
    },

    'MultinomialNB' :{
        'model' : MultinomialNB(),
        'parameters' : {
            'alpha' : np.linspace(0.5, 1.5, 6), 'fit_prior' : [True, False]
        }
    },
        
    'random_forest' :{
        'model' : RandomForestClassifier(),
        'parameters' : {
            'n_estimators' : [80,85,90,95,100], 
            'max_depth':[20,30,None], 'criterion':['gini','entropy']
        }
    }
}

In [22]:
score = []

for model_name, mp in models.items():
    clf = GridSearchCV(mp['model'], mp['parameters'], cv=5, n_jobs=-1)
    print(mp['model'])
    print('\nFitting...')
    best_model = clf.fit(X_train, y_train)
    print('\nFitting...1')
    clf_pred = best_model.predict(X_test)
    
    #confusion matrix
    print(confusion_matrix(y_test,clf_pred))
    print(metrics.classification_report(y_test, clf_pred))
    score.append({
        'model' : model_name,
        'best_accuracy' : best_model.score(X_test, y_test),
        'best_parameters' : clf.best_params_
    })
    print('\nThe score is appended to the list...\n')
    
res = pd.DataFrame(score, columns=['model', 'best_accuracy', 'best_parameters'])
res

SVC()

Fitting...

Fitting...1
[[5376  114]
 [ 142  274]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5490
           1       0.71      0.66      0.68       416

    accuracy                           0.96      5906
   macro avg       0.84      0.82      0.83      5906
weighted avg       0.96      0.96      0.96      5906


The score is appended to the list...

LogisticRegression()

Fitting...

Fitting...1
[[5426   64]
 [ 173  243]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5490
           1       0.79      0.58      0.67       416

    accuracy                           0.96      5906
   macro avg       0.88      0.79      0.83      5906
weighted avg       0.96      0.96      0.96      5906


The score is appended to the list...

MultinomialNB()

Fitting...

Fitting...1
[[5490    0]
 [ 367   49]]
              precision    recall  f1-score   support

           0  

Unnamed: 0,model,best_accuracy,best_parameters
0,SVC,0.956654,"{'C': 10, 'gamma': 1, 'kernel': 'sigmoid'}"
1,logistics_regression,0.959871,"{'C': 100, 'solver': 'lbfgs'}"
2,MultinomialNB,0.93786,"{'alpha': 0.5, 'fit_prior': True}"
3,random_forest,0.954622,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 95}"
