In [10]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [11]:
train_df = pd.read_csv("data/preprocessed/train_new.csv", index_col=None)
dev_df = pd.read_csv("data/preprocessed/dev_new.csv", index_col=None)
print(len(train_df), len(dev_df))

40377 543


In [12]:
train_df = train_df.dropna()
dev_df = dev_df.dropna()

In [13]:
print(len(train_df), len(dev_df))
train_df.info()

40313 543
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40313 entries, 0 to 40376
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  40313 non-null  int64 
 1   Text        40313 non-null  object
 2   Target      40313 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [14]:
train_df['Text'] == train_df['Text'].astype(str)
dev_df['Text'] == dev_df['Text'].astype(str)

0      True
1      True
2      True
3      True
4      True
       ... 
538    True
539    True
540    True
541    True
542    True
Name: Text, Length: 543, dtype: bool

In [15]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Target
0,0,i am getting on borderlands and i will murder ...,Positive
1,1,i am coming to the borders and i will kill you...,Positive
2,2,i am getting on borderlands and i will kill yo...,Positive
3,3,i am coming on borderlands and i will murder y...,Positive
4,4,i am getting on borderlands 2 and i will murde...,Positive


# construct dataset

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
def whitespace_tokenizer(text: str):
    return text.split()


train_texts = train_df['Text']
train_labels = train_df['Target']

dev_texts = dev_df['Text']
dev_labels = dev_df['Target']

In [18]:
# get tf-idf vectors
tfidf_vectorizer = TfidfVectorizer(tokenizer=whitespace_tokenizer)
train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
dev_tfidf = tfidf_vectorizer.transform(dev_texts)

# Models

In [19]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

## LinearSVC

In [20]:
for c in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    lsvc = LinearSVC(C=c)
    lsvc.fit(train_tfidf, train_labels)
    preds = lsvc.predict(dev_tfidf)
    print(f"C={c:6}, acc: {accuracy_score(dev_labels, preds):.3f}")

C= 0.001, acc: 0.772
C=  0.01, acc: 0.882
C=   0.1, acc: 0.952
C=     1, acc: 0.980
C=    10, acc: 0.976
C=   100, acc: 0.974
C=  1000, acc: 0.972


## Logistic Regression

In [21]:
for c in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    lr = LogisticRegression(C=c)
    lr.fit(train_tfidf, train_labels)
    preds = lr.predict(dev_tfidf)
    print(f"C={c:6}, acc: {accuracy_score(dev_labels, preds):.3f}")

C= 0.001, acc: 0.497
C=  0.01, acc: 0.786
C=   0.1, acc: 0.882
C=     1, acc: 0.950
C=    10, acc: 0.976
C=   100, acc: 0.982
C=  1000, acc: 0.980


## Naive Bayes

In [22]:
for a in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    nb = MultinomialNB(alpha=a)
    nb.fit(train_tfidf, train_labels)
    preds = nb.predict(dev_tfidf)
    print(f"alpha={a: 6}, acc: {accuracy_score(dev_labels, preds):.3f}")

alpha= 0.001, acc: 0.941
alpha=  0.01, acc: 0.939
alpha=   0.1, acc: 0.941
alpha=     1, acc: 0.943
alpha=    10, acc: 0.888
alpha=   100, acc: 0.729
alpha=  1000, acc: 0.505


## KNN

In [23]:
for n_neighbors in range(1, 10):
    for weights in ['uniform', 'distance']:
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(train_tfidf, train_labels)
        preds = knn.predict(dev_tfidf)
        print(f"n_neighbors={n_neighbors: 2}, weights={weights:9}, acc: {accuracy_score(dev_labels, preds):.3f}")

n_neighbors= 1, weights=uniform  , acc: 0.991
n_neighbors= 1, weights=distance , acc: 0.991
n_neighbors= 2, weights=uniform  , acc: 0.993
n_neighbors= 2, weights=distance , acc: 0.993
n_neighbors= 3, weights=uniform  , acc: 0.989
n_neighbors= 3, weights=distance , acc: 0.989
n_neighbors= 4, weights=uniform  , acc: 0.989
n_neighbors= 4, weights=distance , acc: 0.989
n_neighbors= 5, weights=uniform  , acc: 0.982
n_neighbors= 5, weights=distance , acc: 0.982
n_neighbors= 6, weights=uniform  , acc: 0.987
n_neighbors= 6, weights=distance , acc: 0.987
n_neighbors= 7, weights=uniform  , acc: 0.983
n_neighbors= 7, weights=distance , acc: 0.983
n_neighbors= 8, weights=uniform  , acc: 0.982
n_neighbors= 8, weights=distance , acc: 0.982
n_neighbors= 9, weights=uniform  , acc: 0.971
n_neighbors= 9, weights=distance , acc: 0.971
