In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## Training Data

In [2]:
# The file is read without a header row, so columns are addressed by
# integer position (0..5) throughout the notebook.
train = pd.read_csv(
    "train.csv",
    encoding='latin-1',
    header=None,
)

In [3]:
# Display the last rows of the training frame to inspect the column layout.
train.tail()

Unnamed: 0,0,1,2,3,4,5
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [4]:
# Clean one tweet: lowercase it, replace punctuation (such as `@`, `-`, `?`)
# and digits with spaces, then return the lemmatized tokens joined by spaces.

import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Matches any single punctuation character or any run of digits. Compiled once
# here rather than on every call; the raw string keeps `\d` from being parsed
# as an (invalid) string escape sequence.
_PUNCT_OR_DIGITS = re.compile(r'[%s]|\d+' % re.escape(string.punctuation))

# Shared lemmatizer, created lazily on the first call to `preprocess_tweets`
# so that it is not rebuilt for every tweet.
_LEMMATIZER = None


def preprocess_tweets(tweet):
    """Return a cleaned, lemmatized version of a single tweet.

    The tweet is lowercased, every punctuation character and every run of
    digits is replaced by a single space, the result is split into tokens with
    `word_tokenize`, each token is passed through `WordNetLemmatizer.lemmatize`,
    and the tokens are joined back together with single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated lemmatized tokens; an empty string when no tokens
        remain after cleaning.
    """
    global _LEMMATIZER
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    cleaned = _PUNCT_OR_DIGITS.sub(' ', tweet.lower())
    return " ".join(_LEMMATIZER.lemmatize(token) for token in word_tokenize(cleaned))
    

In [5]:
# Column 5 holds the raw tweet text; clean every tweet before vectorising.
X = train.iloc[:, 5].apply(preprocess_tweets)

# Column 0 holds the sentiment class. It is selected as a Series (rather than
# the one-column DataFrame `train[[0]]`) so that `y` is a 1-D array of shape
# (n_samples,). A column vector of shape (n_samples, 1) makes scikit-learn
# emit a DataConversionWarning, which the global warnings filter would hide.
# Replacing class==`4` with `1`; every other class becomes `0`.
y = np.where(train.iloc[:, 0] == 4, 1, 0)

In [6]:
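# One-time setup: uncomment and run this cell BEFORE calling `preprocess_tweets`
# if the NLTK data it relies on has not been downloaded yet.
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')  # tokenizer data for word_tokenize; newer NLTK releases may ask for 'punkt_tab'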

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary (presence/absence) features over unigrams and bigrams.
cv = CountVectorizer(ngram_range=(1, 2), binary=True)

# Fit the vocabulary on the cleaned tweets and rebind `X` to the resulting
# document-term matrix. Because `X` is overwritten, re-running this cell on
# its own would feed the matrix back into the vectorizer.
X = cv.fit_transform(X)

In [10]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 25% of the training data for validation (fixed seed for a
# repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=27
)

# Try each candidate value of the regularisation parameter `C` and report the
# validation accuracy; results are printed in the same order as this list.
candidate_cs = [0.01, 0.05, 0.25, 0.5, 1]
for penalty in candidate_cs:
    svm = LinearSVC(C=penalty).fit(X_train, y_train)
    val_accuracy = accuracy_score(y_val, svm.predict(X_val))
    print("Accuracy: %s" % val_accuracy)

Accuracy: 0.8211525
Accuracy: 0.821415
Accuracy: 0.8136625
Accuracy: 0.80862
Accuracy: 0.8036375


## Testing Data

In [None]:
# Read the test set the same way as the training set: no header row, so
# columns are addressed by integer position.
test = pd.read_csv(
    'test.csv',
    encoding='latin-1',
    header=None,
)
test.head()

In [None]:
# X_test = test[[5]]

In [None]:
# Histogram of column 0 (the class label) of the test set.
test.iloc[:, 0].hist()

In [None]:
# Drop the rows of class==`2` ('neutral') by keeping only rows whose label in
# column 0 is not 2. Despite its name, `neutral_test_data` therefore holds the
# NON-neutral rows.
is_not_neutral = test.iloc[:, 0] != 2
neutral_test_data = test.loc[is_not_neutral]

In [None]:
# Clean the test tweets (column 5) with the same preprocessing as training.
X_test = neutral_test_data.iloc[:, 5].apply(preprocess_tweets)

# Encode the labels exactly as for the training set: class `4` -> 1, anything
# else -> 0, as a 1-D array. Building a new array avoids assigning in place
# into a frame derived from a filtered slice of `test`, and gives the metric
# functions a 1-D target instead of a one-column DataFrame.
# NOTE(review): assumes the only remaining classes are `0` and `4` once the
# class-`2` rows have been filtered out -- confirm against the data.
y_test = np.where(neutral_test_data.iloc[:, 0] == 4, 1, 0)

In [None]:
# Encode the test tweets with the already-fitted vectorizer `cv` (transform
# only, no refit). `X_test` is rebound from cleaned text to a feature matrix,
# so re-running this cell on its own would pass the matrix back into `cv`.
X_test = cv.transform(X_test)

In [None]:
# Refit on the full training set using the best value found by the validation
# sweep: C=0.05 scored 0.821415, whereas the library default C=1 scored lowest
# (0.8036375). Leaving `C` unset would silently pick that worst setting.
svm = LinearSVC(C=0.05)
svm.fit(X, y)

In [None]:
# Accuracy of the final model on the held-out test tweets.
test_predictions = svm.predict(X_test)
print("Accuracy: %s" % accuracy_score(y_test, test_predictions))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the final model on the test set
# (rows: true labels, columns: predicted labels).
y_pred = svm.predict(X_test)
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

In [None]:
# `confusion_matrix(y_true, y_pred)` puts true labels on the rows and
# predicted labels on the columns, so for the positive class (label 1):
#   matrix[1][1] = true positives
#   matrix[0][1] = false positives (true 0, predicted 1)
#   matrix[1][0] = false negatives (true 1, predicted 0)
true_pos = matrix[1][1]
false_pos = matrix[0][1]
false_neg = matrix[1][0]

# Precision divides by everything PREDICTED positive; recall divides by
# everything ACTUALLY positive. (The two formulas were previously swapped.)
precision = true_pos / (true_pos + false_pos)
recall = true_pos / (true_pos + false_neg)
print("Precision: %s \nRecall:    %s" % (precision, recall))