In [None]:
%%writefile ml_clf.py
import numpy as np
import pandas as pd 

import seaborn as sn
import matplotlib.pylab as plt 
%matplotlib inline


import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
import re 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from contractions import contractions_dict

import scipy
from scipy import sparse

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier



train=pd.read_csv('train.csv')
test=pd.read_csv('test_with_solutions.csv')

train_comment = train['Comment']
test_comment = test['Comment']
train_label = train['Insult']
test_label = test['Insult']

Data_to_clean = pd.concat([train_comment,test_comment],axis=0)

def remove_characters_before_tokenization(text):
    text = text.strip()
    return re.sub(r'[^a-zA-Z0-9\' ]', r'', text)

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

def remove_stopwords(text):
    stopword_list = nltk.corpus.stopwords.words('english')
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def expand_contra(sentence, contractions_dict):
    contras = re.findall(r'\w+\'\w+', sentence)
    for i in contras:
        expanded_contraction = contractions_dict.get(i)\
                               if contractions_dict.get(i)\
                               else contractions_dict.get(i.lower())
        if expanded_contraction:
            sentence = re.sub(i, expanded_contraction, sentence)
    return sentence

def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for index, text in enumerate(corpus):
        try:
            text = expand_contra(text, contractions_dict)
        except:
            print(index)
        text = remove_stopwords(text)
        normalized_corpus.append(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
    return normalized_corpus

def feat_extract(data,ngram_range):
    vectorizer = CountVectorizer(min_df=1,ngram_range=ngram_range)
    feature = vectorizer.fit_transform(data)
    return(vectorizer,feature)

def tfidf_transformer(matrix):
    transform = TfidfTransformer(norm='l2',smooth_idf=True,use_idf=True)
    tfidf_matrix = transform.fit_transform(matrix)
    
    return(transform, tfidf_matrix)

def metrics(clf_lst, X_test, y_test):
    
    metrics = []
    for clf in clf_lst:
        metrics_lst = []
        cross_val_score(estimator=clf,X=X_test,y=y_test,cv=5)
        y_pred = clf.predict(X_test)
        metrics_lst.append(accuracy_score(y_true=y_test,y_pred=y_pred))
        metrics_lst.append(f1_score(y_true=y_test,y_pred=y_pred,average='weighted'))
        metrics_lst.append(recall_score(y_true=y_test,y_pred=y_pred,average='weighted'))
        metrics_lst.append(precision_score(y_true=y_test,y_pred=y_pred,average='weighted'))
        
        metrics.append(metrics_lst)
    return metrics



if __name__ == '__main__':
    Data_to_clean1 = [remove_characters_before_tokenization(i) for i in Data_to_clean]
    normalized_data = normalize_corpus(corpus=Data_to_clean1,tokenize=False)
    
    train_corpus = normalized_data[:3947]
    test_corpus = normalized_data[3947:]
    
    train_vec,train_feat = feat_extract(data=train_corpus,ngram_range=(1,3))
    train_features = train_feat.todense()
    test_features = train_vec.transform(test_corpus).todense()
    
    train_transform , train_matrix = tfidf_transformer(train_features)
    train_final_feature = train_matrix.todense()
    test_final_feature = train_transform.transform(test_features).todense()
    
    X_training,X_testing=sparse.csr_matrix(train_final_feature),sparse.csr_matrix(test_final_feature)
    
    classifiers_lst = []
    
    NB = MultinomialNB()
    NB.fit(X=X_training,y=train_label)
    classifiers_lst.append(NB)
    
    SGD = SGDClassifier()
    SGD.fit(X=X_training,y=train_label)
    classifiers_lst.append(SGD)
    
    LogReg = LogisticRegression()
    LogReg.fit(X=X_training,y=train_label)
    classifiers_lst.append(LogReg)
    
    GB = GradientBoostingClassifier()
    GB.fit(X=X_training,y=train_label)
    classifiers_lst.append(GB)
    
    RF = RandomForestClassifier()
    RF.fit(X=X_training,y=train_label)
    classifiers_lst.append(RF)
    
    metrics = metrics(classifiers_lst, X_testing, test_label)  
    
    df = pd.DataFrame(metrics, 
             columns=['Accuracy', 'F1_score', 'recall_score', 'precision_score'], 
             index = ['Naive Bayes', "SGD", "Logistic Regression", 'GradientBoosting', 'RandomForest'])
    
    df.T.plot()
    plt.legend(loc='lower right')

    

In [1]:
%%writefile utils.py
import numpy as np
import pandas as pd 

import seaborn as sn
import matplotlib.pylab as plt 
%matplotlib inline


import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
import re 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from contractions import contractions_dict

import scipy
from scipy import sparse


def remove_characters_before_tokenization(text):
    text = text.strip()
    return re.sub(r'[^a-zA-Z0-9\' ]', r'', text)

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

def remove_stopwords(text):
    stopword_list = nltk.corpus.stopwords.words('english')
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def expand_contra(sentence, contractions_dict):
    contras = re.findall(r'\w+\'\w+', sentence)
    for i in contras:
        expanded_contraction = contractions_dict.get(i)\
                               if contractions_dict.get(i)\
                               else contractions_dict.get(i.lower())
        if expanded_contraction:
            sentence = re.sub(i, expanded_contraction, sentence)
    return sentence

def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for index, text in enumerate(corpus):
        try:
            text = expand_contra(text, contractions_dict)
        except:
            print(index)
        text = remove_stopwords(text)
        normalized_corpus.append(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
    return normalized_corpus

def feat_extract(data,ngram_range):
    vectorizer = CountVectorizer(min_df=1,ngram_range=ngram_range)
    feature = vectorizer.fit_transform(data)
    return(vectorizer,feature)

def tfidf_transformer(matrix):
    transform = TfidfTransformer(norm='l2',smooth_idf=True,use_idf=True)
    tfidf_matrix = transform.fit_transform(matrix)
    
    return(transform, tfidf_matrix)

Writing utils.py


In [2]:
%%writefile ml_clf.py
import utils

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,recall_score,precision_score
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

train=pd.read_csv('train.csv')
test=pd.read_csv('test_with_solutions.csv')

train_comment = train['Comment']
test_comment = test['Comment']
train_label = train['Insult']
test_label = test['Insult']



def data_clean(train_comment, test_comment):
    Data_to_clean = pd.concat([train_comment,test_comment],axis=0)
    Data_to_clean1 = [utils.remove_characters_before_tokenization(i) for i in Data_to_clean]
    normalized_data = utils.normalize_corpus(corpus=Data_to_clean1,tokenize=False)
    
    train_corpus = normalized_data[:3947]
    test_corpus = normalized_data[3947:]
    
    train_vec,train_feat = utils.feat_extract(data=train_corpus,ngram_range=(1,3))
    train_features = train_feat.todense()
    test_features = train_vec.transform(test_corpus).todense()
    
    train_transform , train_matrix = utils.tfidf_transformer(train_features)
    train_final_feature = train_matrix.todense()
    test_final_feature = train_transform.transform(test_features).todense()
    
    X_training,X_testing=sparse.csr_matrix(train_final_feature),sparse.csr_matrix(test_final_feature)
    
    return X_training, X_testing

def ml_implement(X_train, y_train, X_test, y_test):
    
    classifiers_lst = []
    
    NB = MultinomialNB()
    NB.fit(X=X_train,y=y_train)
    classifiers_lst.append(NB)
    
    SGD = SGDClassifier()
    SGD.fit(X=X_train,y=y_train)
    classifiers_lst.append(SGD)
    
    LogReg = LogisticRegression()
    LogReg.fit(X=X_train,y=y_train)
    classifiers_lst.append(LogReg)
    
    GB = GradientBoostingClassifier()
    GB.fit(X=X_train,y=y_train)
    classifiers_lst.append(GB)
    
    RF = RandomForestClassifier()
    RF.fit(X=X_train,y=y_train)
    classifiers_lst.append(RF)
    
    metrics = utils.metrics(classifiers_lst, X_test, y_test)  
    
    df = pd.DataFrame(metrics, 
             columns=['Accuracy', 'F1_score', 'recall_score', 'precision_score'], 
             index = ['Naive Bayes', "SGD", "Logistic Regression", 'GradientBoosting', 'RandomForest'])
    
    df.T.plot(kind='bar', figsize = (10, 10), )
    plt.xticks(rotation=360)
    plt.legend(loc='lower right')
    plt.savefig('clf_metrics.png')
    
    return df

if __name__ == "__main__":
    
    X_train, X_test = data_clean(train_comment, test_comment)
    clf_df = ml_implement(X_train, train_label, X_test, test_label)
    
    
    
    

Writing ml_clf.py


In [8]:
run ml_clf.py



NameError: name 'cross_val_score' is not defined

In [10]:
run ml_clf.py

SyntaxError: invalid syntax (ml_clf.py, line 1)

In [11]:
%load_ext autoreload
%autoreload 2

In [16]:
run ml_clf.py



NameError: name 'cross_val_score' is not defined

In [4]:
anti_vac = pd.read_csv('antiva_dataset.csv')

NameError: name 'pd' is not defined

In [18]:
anti_vac.head()

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,...,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,text_tokens
0,0,1514630855423332354,1514630855423332354,1649951000000.0,2022-04-14 10:45:11,-500,,Here’s another milestone unlocked! We are deli...,en,['covlex'],...,,,,,[],,,,,
1,1,1514630825174003714,1514630825174003714,1649951000000.0,2022-04-14 10:45:03,-500,,India Covid-19 Vaccination Update: 14-Apr-2022...,en,"['largestvaccinedrive', 'largestvaccinationdri...",...,,,,,[],,,,,
2,3,1514630801085775875,1514615791005577229,1649951000000.0,2022-04-14 10:44:58,-500,,@CTVNews Healthy kids are better off without v...,en,[],...,,,,,"[{'screen_name': 'CTVNews', 'name': 'CTV News'...",,,,,
3,6,1514630751521812480,1514418518946582531,1649951000000.0,2022-04-14 10:44:46,-500,,@ZaleskiLuke @Acyn And the rest of the Greed O...,en,[],...,,,,,"[{'screen_name': 'ZaleskiLuke', 'name': 'Luke ...",,,,,
4,10,1514630611209789442,1514630611209789442,1649951000000.0,2022-04-14 10:44:12,-500,,Our Mobile Vaccination Unit will be back at Mo...,en,[],...,,,,,[],,,,,


In [19]:
anti_vac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9432 entries, 0 to 9431
Data columns (total 40 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       9432 non-null   int64  
 1   id               9432 non-null   int64  
 2   conversation_id  9432 non-null   int64  
 3   created_at       9432 non-null   float64
 4   date             9432 non-null   object 
 5   timezone         9432 non-null   int64  
 6   place            1 non-null      object 
 7   tweet            9432 non-null   object 
 8   language         9432 non-null   object 
 9   hashtags         9432 non-null   object 
 10  cashtags         9432 non-null   object 
 11  user_id          9432 non-null   int64  
 12  user_id_str      9432 non-null   int64  
 13  username         9432 non-null   object 
 14  name             9430 non-null   object 
 15  day              9432 non-null   int64  
 16  hour             9432 non-null   int64  
 17  link          

In [23]:
da = anti_vac['tweet']

In [2]:
import utils

In [21]:
import ml_clf

In [3]:
Data_to_clean1 = [utils.remove_characters_before_tokenization(i) for i in da]
normalized_data = utils.normalize_corpus(corpus=Data_to_clean1,tokenize=False)

NameError: name 'da' is not defined

In [1]:
train_vec,train_feat = utils.feat_extract(data=normalized_data,ngram_range=(1,3))
train_features = train_feat.todense()

train_transform , train_matrix = utils.tfidf_transformer(train_features)
train_final_feature = train_matrix.todense()

X_training = sparse.csr_matrix(train_final_feature)

NameError: name 'utils' is not defined

In [5]:
%%writefile pc_analysis.py
import utils

from wordcloud import WordCloud, STOPWORDS
import string

import pandas as pd
import numpy as np

def data_clean(df):
    data = [utils.remove_characters_before_tokenization(i) for i in df]
    data1= utils.normalize_corpus(corpus=data,tokenize=False)
    
    text = " ".join(tweet for tweet in data1)
    return text

def plot_wordcloud(text):
    stopwords = set(STOPWORDS)
    stopwords.update(list(string.punctuation) + ["https", "people", 'think', 'will', 's', 'others', "one", "politically correct", "politically", "correct", "political correctness", "political", "correctness", "sensitive", 'covid','covid-19', 'covid19', "vaccines", 'vaxxer', 'vaxxers', 't', 'co', 'pandemic', 'anti-vaccine', 'amp'] + ['considerate', 'diplomatic', 'gender free', 'inclusive', 'inoffensive', 'multicultural', 'multiculturally sensitive', 'politic', 'respectful', 'sensitive', 'sensitive to others', 'bias free', 'liberal', 'nondiscriminatory', 'nonracist', 'nonsexist', 'unbiased', 'political correctness', 'politically correct'])



    wordcloud = WordCloud(stopwords=stopwords, background_color="white", min_word_length=4, collocation_threshold=4).generate_from_text(text)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.savefig('pc_wordcloud.png')
    
if __name__ == '__main__':
    df_pc= pd.read_csv('pc_dataset.csv', index_col=0)
    text = data_clean(df_pc['tweet'])



Writing pc_analysis.py
