In [1]:
import numpy as np
import pandas as pd
import re
import string
import spacy
import ssl
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
path = '/Users/loelee/Courses/CS4248/Project/'

In [3]:
train = pd.read_csv(path+'raw_data/fulltrain.csv',header=None, names=['Label', 'Text'])
train.head()

Unnamed: 0,Label,Text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [4]:
test = pd.read_csv(path+'raw_data/balancedtest.csv',header=None, names=['Label', 'Text'])
test.head()

Unnamed: 0,Label,Text
0,1,When so many actors seem content to churn out ...
1,1,In what football insiders are calling an unex...
2,1,In a freak accident following Game 3 of the N....
3,1,North Koreas official news agency announced to...
4,1,The former Alaska Governor Sarah Palin would b...


## Preprocessing

In [None]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

spacy.cli.download("en_core_web_lg")

In [5]:
nlp = spacy.load('en_core_web_lg')

In [6]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    return text

In [20]:
train_clean = train.copy()
train_clean["Text"] = train_clean["Text"].apply(wordopt)
train_clean.head()

Unnamed: 0,Label,Text
0,1,a little less than a decade ago hockey fans w...
1,1,the writers of the hbo series the sopranos too...
2,1,despite claims from the tv news outlet to offe...
3,1,after receiving subpar service and experienc...
4,1,after watching his beloved seattle mariners pr...


In [6]:
def lemmatize(text):
    text = nlp(text)
    lemmatized = [token.lemma_ for token in text] 
    clean_text = ' '.join(lemmatized)
    return clean_text

In [31]:
from tqdm import tqdm
for i in tqdm(range(len(train_clean))):
    train_clean.loc[i,"Text"] = lemmatize(train_clean.loc[i,"Text"])

train_clean.head()

100%|██████████| 48854/48854 [1:33:14<00:00,  8.73it/s]   


Unnamed: 0,Label,Text
0,1,a little less than a decade ago hockey fan b...
1,1,the writer of the hbo series the soprano take ...
2,1,despite claim from the tv news outlet to offer...
3,1,after receive subpar service and experienc...
4,1,after watch his beloved seattle mariner prevai...


In [32]:
train_clean.to_csv('train_lema.csv',index=False)

In [9]:
test_clean = test.copy()
test_clean["Text"] = test_clean["Text"].apply(wordopt)

from tqdm import tqdm
for i in tqdm(range(len(test_clean))):
    test_clean.loc[i,"Text"] = lemmatize(test_clean.loc[i,"Text"])

test_clean.head()

100%|██████████| 3000/3000 [04:29<00:00, 11.13it/s]


Unnamed: 0,Label,Text
0,1,when so many actor seem content to churn out p...
1,1,in what football insider be call an unexpect...
2,1,in a freak accident follow game of the n b a...
3,1,north koreas official news agency announce tod...
4,1,the former alaska governor sarah palin would b...


In [10]:
test_clean.to_csv('test_lema.csv',index=False)

## Feature Engineering

In [5]:
train_clean = pd.read_csv('train_lema.csv')
test_clean = pd.read_csv('test_lema.csv')

### Feature 1: Word Frequency

In [6]:
# tf-idf
tfidf = TfidfVectorizer(stop_words='english')
train_tfidf = tfidf.fit_transform(train_clean["Text"]).toarray()
train_tfidf.shape

(48854, 195894)

In [7]:
test_tfidf = tfidf.transform(test_clean["Text"]).toarray()

#### Show top10 words by tfidf

In [8]:
# Label 1
sum_tfidf = train_tfidf[train_clean["Label"]==1].sum(axis=0).tolist()
word_tfidf = pd.DataFrame({'word':tfidf.get_feature_names_out(),
                           'tfidf':sum_tfidf})
word_tfidf.sort_values(by='tfidf',ascending=False).head(10)

Unnamed: 0,word,tfidf
150233,say,574.441404
173728,time,266.877323
90067,just,264.940096
1815,add,234.122749
193329,year,231.719403
111449,monday,204.877604
99003,like,193.494229
102888,make,179.180256
122477,old,171.601802
143470,reportedly,171.267907


### Feature 2: count of sentences and words

In [9]:
def sentence_detect(text):
    sent_punct = re.findall('[.!?](?!\w)',text)
    return len(sent_punct)

In [10]:
def normalize(series):
    max_ = max(series)
    min_ = min(series)
    scale = max_ - min_
    return series.apply(lambda x: (x-min_)/scale)

In [11]:
x_train = pd.DataFrame(train_tfidf)

In [12]:
x_train['sent_count'] = train["Text"].apply(sentence_detect)
x_train['sent_count'] = normalize(x_train['sent_count'])

In [13]:
x_train['word_count'] = train_clean["Text"].apply(lambda x: len(x.split()))
x_train['word_count'] = normalize(x_train['word_count'])

In [14]:
x_train['unique_word_count'] = train_clean["Text"].apply(lambda x: len(set(x.split())))
x_train['unique_word_count'] = normalize(x_train['unique_word_count'])

In [15]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,195887,195888,195889,195890,195891,195892,195893,sent_count,word_count,unique_word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001026,0.00122,0.013249
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000513,0.001012,0.010575
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004615,0.005916,0.043758
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003461,0.006023,0.039747
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000897,0.001485,0.014708


In [16]:
x_test = pd.DataFrame(test_tfidf)

x_test['sent_count'] = test["Text"].apply(sentence_detect)
x_test['sent_count'] = normalize(x_test['sent_count'])

x_test['word_count'] = test_clean["Text"].apply(lambda x: len(x.split()))
x_test['word_count'] = normalize(x_test['word_count'])

x_test['unique_word_count'] = test_clean["Text"].apply(lambda x: len(set(x.split())))
x_test['unique_word_count'] = normalize(x_test['unique_word_count'])

### Base line models

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [18]:
y_train = train["Label"]
y_test = test["Label"]

In [19]:
mnb = MultinomialNB(alpha=0.01)
mnb.fit(x_train.values,y_train)

In [20]:
# train performance
mnb_pred = mnb.predict(x_train.values)
print(metrics.classification_report(y_train,mnb_pred))

              precision    recall  f1-score   support

           1       0.94      0.96      0.95     14047
           2       0.98      0.95      0.97      6942
           3       0.96      0.99      0.98     17870
           4       0.99      0.91      0.95      9995

    accuracy                           0.96     48854
   macro avg       0.97      0.96      0.96     48854
weighted avg       0.96      0.96      0.96     48854



In [21]:
# test performance
mnb_pred = mnb.predict(x_test.values)
metrics.f1_score(y_test,mnb_pred,average='micro')

0.6706666666666666

In [22]:
print(metrics.classification_report(y_test,mnb_pred))

              precision    recall  f1-score   support

           1       0.63      0.63      0.63       750
           2       0.67      0.42      0.52       750
           3       0.57      0.93      0.71       750
           4       0.93      0.71      0.80       750

    accuracy                           0.67      3000
   macro avg       0.70      0.67      0.66      3000
weighted avg       0.70      0.67      0.66      3000

