## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in your data from kaggle.  
By working in a kaggle kernel, you can access the data directly from the competition, as well as make your submission without downloading your output file

In [2]:
train = pd.read_csv('traink.csv')
test = pd.read_csv('testk.csv')

In [3]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

### Clean the data - lower case, punctuation, numbers, Url removal

In [4]:
# Remove urls train
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
train['message'] = train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

#for test
test['message'] = test['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [5]:
# Make lower case train
train['train'] = train['message'].str.lower()

#for test
test['message'] = test['message'].str.lower()

In [6]:
# Strip out punctuation marks and numerals train
import string
def remove_punctuation_numbers(post):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in post if l not in punc_numbers])
train['message'] = train['message'].apply(remove_punctuation_numbers)

# for test
test['message'] = test['message'].apply(remove_punctuation_numbers)

In [7]:
train.head()


Unnamed: 0,sentiment,message,tweetid,train
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221,polyscimajor epa chief doesn't think carbon di...
1,1,Its not like we lack evidence of anthropogenic...,126103,it's not like we lack evidence of anthropogeni...
2,2,RT RawStory Researchers say we have three year...,698562,rt @rawstory: researchers say we have three ye...
3,1,TodayinMaker WIRED was a pivotal year in the...,573736,#todayinmaker# wired : 2016 was a pivotal year...
4,1,RT SoyNovioDeTodas Its and a racist sexist cl...,466954,"rt @soynoviodetodas: it's 2016, and a racist, ..."


### Resampling - put in the code here for up/down sample

In [8]:
from sklearn.utils import resample
believe = train[train['sentiment'] == 1]
no_belief = train[train['sentiment'] == -1]
neutral = train[train['sentiment'] == 0]
news = train[train['sentiment'] == 2]

# upsample minority
believe_downsampled = resample(believe, 
                               replace = True, # sample with replacement
                              n_samples = len(believe), # match number of majority class
                              random_state = 27) # reproducable results

no_belief_upsampled = resample(no_belief, 
                               replace = True, # sample with replacement
                              n_samples = 6850, # match number of majority class
                              random_state = 27) # reproducable results

neutral_upsampled = resample(neutral, 
                               replace = True, # sample with replacement
                              n_samples = 6850, # match number of majority class
                              random_state = 27) # reproducable results

news_upsampled = resample(news, 
                               replace = True, # sample with replacement
                              n_samples = 6850, # match number of majority class
                              random_state = 27) # reproducable results

#combine majority and upsampled minorities
upsampled = pd.concat([believe_downsampled, no_belief_upsampled, neutral_upsampled, news_upsampled])


## Splitting out the X variable from the target

In [9]:
y = upsampled['sentiment']
X = upsampled['message']

## Turning text into something your model can read

In [10]:
vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=2, max_df = 0.9)

X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [11]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.175,shuffle=True, stratify=y, random_state=11)

## Training the model and evaluating using the validation set - Gridsearch, linear

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
## try logisitic regression - use grid search
##rfc = RandomForestClassifier()
##rfc.fit(X_train, y_train)
##rfc_pred = rfc.predict(X_val)

#parameters = {'kernel':('linear', 'rbf'), 
             # 'C':(.25, 1.0),
              #'gamma': (0, 1)}

svm = SVC()
clf = SVC( kernel = 'linear', C = 1, gamma = 0.0025) #hardcode kernel is linear c=1 and gamma very small to speed model running based on all previous gridsearch returning same
#clf = GridSearchCV(svm, parameter)
clf.fit(X_train,y_train)

y_opt = clf.predict(X_val)
print("The accuracy for our tuned model is: ", accuracy_score(y_val, y_opt))

## Checking the performance of our model on the validation set

In [None]:
f1_score(y_val, y_opt, average="macro")

In [None]:
#clf.best_params_

## Getting our test set ready 

In [None]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [None]:
y_pred = clf.predict(test_vect)

In [None]:
test['sentiment'] = y_pred

In [None]:
test.head()

## Creating an output csv for submission

In [None]:
test[['tweetid','sentiment']].to_csv('kagglesubmission32.csv', index=False)