In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import nltk
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import re

In [2]:
# Fetch the NLTK resources used below: the stop-word list, the Punkt models
# behind word_tokenize, and WordNet for the lemmatizer.
# 'punkt_tab' is the resource newer NLTK releases load for word_tokenize;
# nltk.download reports (rather than raises) if a package id cannot be fetched,
# so listing both 'punkt' and 'punkt_tab' covers old and new installs.
for package in ['stopwords', 'punkt', 'punkt_tab', 'wordnet']:
    nltk.download(package)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words as a set, for fast membership tests in the tokenizers.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\desla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\desla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\desla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Project 1 - NLP and Text Classification

For this project you will need to classify some angry comments into their respective category of angry. The process that you'll need to follow is (roughly):
<ol>
<li> Use NLP techniques to process the training data. 
<li> Train model(s) to predict which class(es) each comment is in.
    <ul>
    <li> A comment can belong to any number of classes, including none. 
    </ul>
<li> Generate predictions for each of the comments in the test data. 
<li> Write your test data predictions to a CSV file, which will be scored. 
</ol>

You can use any models and NLP libraries you'd like. Think about the problem, look back to see if there's anything that might help, give it a try, and see if that helps. We've regularly said we have a "toolkit" of things that we can use; we generally don't know which ones we'll need, but here you have a pretty simple goal - if it makes it more accurate, it helps. There's not one specific solution here; there are lots of things that you could do. 

## Training Data

Use the training data to train your prediction model(s). Each of the classification output columns (toxic to the end) is a human label for the comment_text, assessing if it falls into that category of "rude". A comment may fall into any number of categories, or none at all. Membership in one output category is <b>independent</b> of membership in any of the other classes (think about this when you plan on how to make these predictions - it may also make it easier to split work amongst a team...). 

In [3]:
# Load the labelled training comments from the zipped CSV and display the frame.
train_df = pd.read_csv("train.csv.zip")
train_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
# Draw a 100,000-row random subset of the training data; the model cells below
# train on this subset rather than on the full frame.
# random_state pins the draw so re-running the notebook uses the same rows.
train_df_sub = train_df.sample(n=100000, random_state=42)
train_df_sub

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
7624,14480eaf7404f15f,Msg from flc webmaster -\n\nThis is ridiculous...,0,0,0,0,0,0
93669,fa77bd5aba0e35cd,"I appreciate the work you did, but whether it'...",0,0,0,0,0,0
118093,76d73cdf80cdfdbc,"""\nWell, this is our third AnonTalk filter. ...",0,0,0,0,0,0
72836,c2db783a5d33467c,"""\n\nre:Znypes\nHi, thanks for the support on ...",0,0,0,0,0,0
95742,0005be6eea9c30e8,Jeffrey O. Gustafson|Jeffrey O. Gustafson]] - ...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
79884,d5c3cf814bfa519c,"""The article reads: """"In the first (modern-day...",0,0,0,0,0,0
67110,b39031451f93a664,"""\n\n Khalifa Ezzat sourcing \n\nAHLM13, I too...",0,0,0,0,0,0
8875,17903046a4eb8f13,FUCK OFF MY PAGE!!!!!!!!!!!!!!,1,1,1,0,0,0
10287,1b3819bc57743be7,Dear contributors please read Vanniyar Puranam...,0,0,0,0,0,0


In [9]:
# Class balance of the 'threat' label, computed on the full training frame
# (train_df), not on the sampled subset.
train_df['threat'].value_counts()

0    159093
1       478
Name: threat, dtype: int64

In [7]:
# Load the unlabelled test comments and preview the first rows.
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,comment_text
0,1,Yo bitch Ja Rule is more succesful then you'll...
1,2,== From RfC == \n\n The title is fine as it is...
2,3,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,4,":If you have a look back at the source, the in..."
4,5,I don't anonymously edit articles at all.


In [48]:
class lemmaTokenizer(object):
    """Callable tokenizer: word-tokenize, drop stop words, strip non-word
    characters, lemmatize, and keep results of at least two characters.

    Parameters
    ----------
    stop_words : iterable of str
        Tokens to discard. The comparison is exact (case-sensitive) and is
        done on the raw token, before non-word characters are stripped.

    Calling the instance with a document string returns a list of str.
    """

    # One or more characters that are not letters, digits or underscore.
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    _NON_WORD = re.compile(r'\W+')

    def __init__(self, stop_words):
        # Keep a private frozen copy so the instance filters with the words it
        # was built with, independent of any later rebinding of a global name,
        # and so membership tests stay O(1) even if a list was passed in.
        self.stop_words = frozenset(stop_words or ())
        from nltk.stem import WordNetLemmatizer
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, doc):
        filtered_tok = []
        for tok in word_tokenize(doc):
            if tok in self.stop_words:
                continue
            # Strip punctuation glued to the token (e.g. "here!!" -> "here").
            tok = self._NON_WORD.sub('', tok)
            tmp = self.lemmatizer.lemmatize(tok)
            # Drops empty strings and single characters left after stripping.
            if len(tmp) >= 2:
                filtered_tok.append(tmp)
        return filtered_tok

# Module-level lemmatizing tokenizer instance, built from the current stop_words.
tokenize = lemmaTokenizer(stop_words)

In [49]:
# Preview the lemmatizing tokenizer on the test comments.
# The tokens go into their own Series instead of overwriting
# test_df['comment_text']: the fitted pipelines vectorize raw strings at
# predict time, and keeping the input untouched makes this cell re-runnable.
test_tokens = test_df['comment_text'].apply(tokenize)
# Display a copy of the frame with the tokenized text; test_df is unchanged.
test_df.assign(comment_text=test_tokens)

Unnamed: 0,id,comment_text
0,1,"[Yo, bitch, Ja, Rule, succesful, ll, ever, wha..."
1,2,"[From, RfC, The, title, fine, IMO]"
2,3,"[Sources, Zawe, Ashton, Lapland]"
3,4,"[If, look, back, source, information, updated,..."
4,5,"[nt, anonymously, edit, article]"
...,...,...
153159,153160,"[totally, agree, stuff, nothing, toolongcrap]"
153160,153161,"[Throw, field, home, plate, Does, get, faster,..."
153161,153162,"[Okinotorishima, category, see, change, agree,..."
153162,153163,"[One, founding, nation, EU, Germany, Law, Retu..."


In [5]:
# Bag-of-words exploration: raw counts for the 150 most frequent terms.
vec_cv = CountVectorizer(max_features=150)
tmp = vec_cv.fit_transform(train_df["comment_text"])
tok_cols = vec_cv.get_feature_names_out()
# Wrap the sparse matrix directly rather than calling .toarray(): the frame is
# only inspected via head(), so there is no need for a dense
# (n_comments x n_terms) array in memory.
tok_df = pd.DataFrame.sparse.from_spmatrix(tmp, columns=tok_cols)
print("original:",train_df["comment_text"].shape)
print("vectorized:", tmp.shape)
tok_df.head(10)

original: (159571,)
vectorized: (159571, 150)


Unnamed: 0,about,add,after,again,all,also,am,an,and,any,...,who,why,wikipedia,will,with,work,would,wp,you,your
0,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,2,...,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,2,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,2,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# TF-IDF exploration: weights for the 300 most frequent non-stop-word terms.
# From here on stop_words is a list of English stop words plus single
# punctuation characters (later cells reuse this name).
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
vec_tf = TfidfVectorizer(max_features=300, stop_words=stop_words)
tmp = vec_tf.fit_transform(train_df["comment_text"])
tok_cols = vec_tf.get_feature_names_out()
# Keep the matrix sparse-backed; a dense float array of this shape is only
# needed if every cell is going to be read, and here only head() is shown.
tok_df = pd.DataFrame.sparse.from_spmatrix(tmp, columns=tok_cols)
print("original:", train_df["comment_text"].shape)
print("vectorized:", tmp.shape)
tok_df.head(10)

original: (159571,)
vectorized: (159571, 300)


Unnamed: 0,account,actually,add,added,adding,address,admin,agree,already,also,...,would,wp,write,written,wrong,www,year,years,yes,yet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.186751,0.0,0.300303,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Model trained for 'toxic' class

# English stop words plus single punctuation characters.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
# TF-IDF over unigrams to trigrams, vocabulary capped at 300 terms.
vec_tf = TfidfVectorizer(max_features=300, stop_words=stop_words, ngram_range=(1,3))
model_svc = SVC()

y_toxic = train_df_sub["toxic"]
X_toxic = train_df_sub["comment_text"]

pipe_toxic = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Fit on the whole sampled subset (this cell keeps no hold-out split).
# The label Series is passed as-is: scikit-learn accepts it directly, and
# Series.ravel() is deprecated in recent pandas releases.
pipe_toxic.fit(X_toxic, y_toxic)

In [None]:
# Model trained for 'severe_toxic' class

# English stop words plus single punctuation characters.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
# TF-IDF over unigrams to trigrams, vocabulary capped at 300 terms.
vec_tf = TfidfVectorizer(max_features=300, stop_words=stop_words, ngram_range=(1,3))
model_svc = SVC()

y_severe_toxic = train_df_sub["severe_toxic"]
X_severe_toxic = train_df_sub["comment_text"]

pipe_severe_toxic = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Fit on the whole sampled subset (this cell keeps no hold-out split).
# The label Series is passed as-is: scikit-learn accepts it directly, and
# Series.ravel() is deprecated in recent pandas releases.
pipe_severe_toxic.fit(X_severe_toxic, y_severe_toxic)

In [15]:
# Model trained for 'obscene' class
model_svc = SVC()

# English stop words plus single punctuation characters.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
# TF-IDF over unigrams to trigrams, vocabulary capped at 300 terms.
vec_tf = TfidfVectorizer(max_features=300, stop_words=stop_words, ngram_range=(1,3))

y_obscene = train_df_sub["obscene"]
X_obscene = train_df_sub["comment_text"]

# Stratify so train and hold-out keep the same positive rate, and seed the
# split so the reported score is reproducible across re-runs.
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(
    X_obscene, y_obscene, stratify=y_obscene, random_state=42)

pipe_obscene = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Labels are passed as a Series; Series.ravel() is deprecated in recent pandas.
pipe_obscene.fit(X_train_o, y_train_o)
# Mean accuracy on the hold-out split.
pipe_obscene.score(X_test_o, y_test_o)



ValueError: Found input variables with inconsistent numbers of samples: [75000, 25000]

In [11]:
# Model trained for 'threat' class

# English stop words plus single punctuation characters.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
# TF-IDF over unigrams to trigrams, vocabulary capped at 500 terms.
vec_tf = TfidfVectorizer(max_features=500, stop_words=stop_words, ngram_range=(1,3))
# class_weight='balanced' re-weights the classes inversely to their frequency.
model_svc = SVC(class_weight='balanced')

y_threat = train_df_sub["threat"]
X_threat = train_df_sub["comment_text"]

# 'threat' positives are rare (478 of 159,571 rows in the full training set),
# so stratify to guarantee both splits contain them in the same proportion;
# the seed makes the split and the score reproducible.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    X_threat, y_threat, stratify=y_threat, random_state=42)

pipe_threat = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Labels are passed as a Series; Series.ravel() is deprecated in recent pandas.
pipe_threat.fit(X_train_t, y_train_t)
# NOTE: score() is plain accuracy. With a class this rare, an all-zero
# predictor already scores above 0.99, so judge this model by per-class
# recall/precision rather than by this number alone.
pipe_threat.score(X_test_t, y_test_t)



0.96208

In [74]:
# Trained models applied to the test data to produce predictions
# NOTE(review): the pipelines were fit on raw comment strings, so
# test_df['comment_text'] should hold raw strings here — confirm it has not
# been replaced by token lists before this cell runs.
obscene_test = pipe_obscene.predict(test_df['comment_text'])
threat_test = pipe_threat.predict(test_df['comment_text'])

In [12]:
# Recomputes the 'threat' predictions only (same call as in the cell above).
threat_test = pipe_threat.predict(test_df['comment_text'])

In [13]:
# Combining predictions into a data frame
# The submission format is exactly: id followed by the six label columns, in
# this order, and no comment text.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
new_data = {'id': test_df['id'], 'obscene': obscene_test, 'threat': threat_test}
final_df = pd.DataFrame(new_data)
# reindex puts the columns in submission order. Labels that have no prediction
# array in this cell are filled with 0 (the majority class) so the file has
# the structure the scoring script expects.
# TODO: replace the 0-filled columns with real model predictions.
final_df = final_df.reindex(columns=['id'] + label_cols, fill_value=0)
final_df.head()

NameError: name 'obscene_test' is not defined

In [14]:
# Predicted-class counts for the two modelled labels, 'threat' first.
for label in ('threat', 'obscene'):
    print(final_df[label].value_counts())

NameError: name 'final_df' is not defined

In [None]:
# Writing the final results into a csv file
# index=False keeps the DataFrame index out of the file, so the CSV holds only
# the columns of final_df.
final_df.to_csv('out.csv', index=False)

In [24]:
class swTokenizer(object):
    """Callable tokenizer: word-tokenize a document and drop stop words.

    Parameters
    ----------
    stop_words : iterable of str
        Tokens to discard; the comparison is exact (case-sensitive).

    Calling the instance with a document string returns a list of str.
    """

    def __init__(self, stop_words):
        # Keep a private frozen copy so the instance filters with the words it
        # was built with rather than whatever a global `stop_words` holds at
        # call time; a set also keeps membership tests O(1).
        self.stop_words = frozenset(stop_words or ())

    def __call__(self, doc):
        return [tok for tok in word_tokenize(doc) if tok not in self.stop_words]

In [14]:
class stemTokenizer(object):
    """Callable tokenizer: word-tokenize, drop stop words, then stem each token
    with NLTK's English Snowball stemmer.

    Parameters
    ----------
    stop_words : iterable of str
        Tokens to discard; the comparison is exact (case-sensitive) and is
        done before stemming.

    Calling the instance with a document string returns a list of str.
    """

    def __init__(self, stop_words):
        # Keep a private frozen copy so the instance filters with the words it
        # was built with rather than whatever a global `stop_words` holds at
        # call time; a set also keeps membership tests O(1).
        self.stop_words = frozenset(stop_words or ())
        from nltk.stem import SnowballStemmer
        self.stemmer = SnowballStemmer(language='english')

    def __call__(self, doc):
        return [
            self.stemmer.stem(tok)
            for tok in word_tokenize(doc)
            if tok not in self.stop_words
        ]

In [16]:
# Grid search over vectorizer settings for the 'obscene' label.

# token_pattern=None: every grid point supplies a custom tokenizer, so the
# default token_pattern is never used; None silences the per-fit warning.
vec_tf = TfidfVectorizer(token_pattern=None)
# Create the estimator in this cell instead of relying on whichever model_svc
# an earlier cell happened to leave in the kernel.
model_svc = SVC()

y = train_df_sub["obscene"]
X = train_df_sub["comment_text"]

# Stratified, seeded split so the hold-out report is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

pipe2 = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# 4 vocabulary sizes x 3 tokenizers x 2 norms = 24 candidates, 5 folds each.
params = {"vect__max_features":[100,500,1000,1500],
            "vect__tokenizer":(swTokenizer(stop_words), stemTokenizer(stop_words), lemmaTokenizer(stop_words) ),
            "vect__norm":["l1","l2"]
            }

grid = GridSearchCV(estimator  = pipe2,
                               param_grid = params,
                               scoring    = "balanced_accuracy",
                               cv         = 5,
                               n_jobs     =-1)

# Labels are passed as a Series; Series.ravel() is deprecated in recent pandas.
grid.fit(X_train, y_train)
best = grid.best_estimator_
preds = best.predict(X_test)
print(best)
print(classification_report(y_test, preds))
# fmt="d" prints the cell counts as integers instead of scientific notation.
sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt="d")

In [14]:
# Second grid search with its own vectorizer/estimator and a smaller
# max_features grid. X_train, X_test, y_train and y_test are not created here;
# they must already exist from an earlier cell.
vec_tf3 = TfidfVectorizer()
model_svc3 = SVC()

pipe3 = Pipeline(steps=[("vect", vec_tf3), ("model", model_svc3)])

# 3 vocabulary sizes x 3 tokenizers x 2 norms = 18 candidates.
params3 = dict(
    vect__max_features=[100, 500, 1000],
    vect__tokenizer=(
        swTokenizer(stop_words),
        stemTokenizer(stop_words),
        lemmaTokenizer(stop_words),
    ),
    vect__norm=["l1", "l2"],
)

grid3 = GridSearchCV(
    pipe3,
    params3,
    scoring="balanced_accuracy",
    cv=5,
    n_jobs=-1,
)

# Fit, then evaluate the refit best pipeline on the hold-out split.
grid3.fit(X_train, y_train)
best3 = grid3.best_estimator_
preds3 = best3.predict(X_test)
print(best3)
print(classification_report(y_test, preds3))
sns.heatmap(confusion_matrix(y_test, preds3), annot=True)

KeyboardInterrupt: 

## Test Data

In [11]:
# Re-reads test.csv into test_df, replacing whatever test_df held before,
# and previews the first rows.
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,comment_text
0,1,Yo bitch Ja Rule is more succesful then you'll...
1,2,== From RfC == \n\n The title is fine as it is...
2,3,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,4,":If you have a look back at the source, the in..."
4,5,I don't anonymously edit articles at all.


## Output Details, Submission Info, and Example Submission

For this project, please output your predictions in a CSV file. The structure of the CSV file should match the structure of the example below. 

The output should contain one row for each row of test data, complete with the columns for ID and each classification.

Into Moodle please submit:
<ul>
<li> Your notebook file(s). I'm not going to run them, just look. 
<li> Your sample submission CSV. This will be evaluated for accuracy against the real labels; only a subset of the predictions will be scored. 
</ul>

It is REALLY, REALLY, REALLY important that the structure of your output matches the specifications. The accuracies will be calculated by a script, and it is expecting a specific format. 

### Sample Evaluator

The file prediction_evaluator.ipynb contains an example scoring function, scoreChecker. This function takes a submission and an answer key, loops through, and evaluates the accuracy. You can use this to verify the format of your submission. I'm going to use the same function to evaluate the accuracy of your submission, against the answer key (unless I made some mistake in this counting function).

In [14]:
#Construct dummy data for a sample output.
#You won't do this part first, you have real data - I'm faking it.
#Your data should have the same structure, so the CSV output is the same
dummy_ids = ["dfasdf234", "asdfgw43r52", "asdgtawe4", "wqtr215432"]
dummy_toxic = [0,0,0,0]
dummy_severe = [0,0,0,0]
dummy_obscene = [0,1,1,0]
dummy_threat = [0,1,0,1]
dummy_insult = [0,0,1,0]
dummy_ident = [0,1,1,0]
columns = ["id", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
#Pair each output column name with its list of values, in submission order.
dummy_values = [dummy_ids, dummy_toxic, dummy_severe, dummy_obscene,
                dummy_threat, dummy_insult, dummy_ident]
sample_out = pd.DataFrame(dict(zip(columns, dummy_values)), columns=columns)
sample_out.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,dfasdf234,0,0,0,0,0,0
1,asdfgw43r52,0,0,1,1,0,1
2,asdgtawe4,0,0,1,0,1,1
3,wqtr215432,0,0,0,1,0,0


In [13]:
#Write DF to CSV. Please keep the "out.csv" filename. Moodle will auto-preface it with an identifier when I download it. 
#This command should work with your dataframe of predictions. 
# NOTE(review): this writes the dummy sample_out frame to 'out.csv', the same
# path the real predictions (final_df) are written to earlier in this notebook.
# Running every cell in order therefore leaves the dummy data in the file —
# skip or remove this cell before producing the submission.
sample_out.to_csv('out.csv', index=False)  

## Grading

The grading for this is split between accuracy and well written code:
<ul>
<li> 75% - Accuracy. The most accurate will get 100% on this, the others will be scaled down from there. 
<li> 25% - Code quality. Can the code be followed and made sense of - i.e. comments, sections, titles. 
</ul>