# Twitter Sentiment Analysis

# 01 : Frame the Problem

#### Problem Statement Link :  https://datahack.analyticsvidhya.com/contest/practice-problem-twitter-sentiment-analysis/

# 02 : Obtain Data

### Import Statements

In [0]:
!mkdir twitter
%cd twitter
!ls


In [0]:
!pip install missingno
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as ms
% matplotlib inline




### Reading the Train Data

In [0]:
!wget https://www.dropbox.com/s/p8fq1p6wan2g89a/train.csv -q

In [0]:
!ls -l

In [0]:
# Load the training CSV fetched above and print its column / dtype / non-null summary.
train = pd.read_csv('train.csv')
train.info()

In [0]:
# Show up to 240 characters per cell so whole tweets are visible in the tables below.
# The full option key is used; shorthand keys are resolved by pattern matching and can
# become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 240)

In [0]:
train.head()

# 03 : Analyze Data

In [0]:
train.head(20)

In [0]:
train.iloc[13]

In [0]:
train['label'].value_counts()

In [0]:
train[train['label']==1]['tweet'].head()

## Label types
-   0 : Normal
-   1 : Hate

# 05 : Model Selection (1st Iteration)

## RandomForest without Preprocessing of Text Data

In [0]:
# Baseline: re-read the raw CSV into its own frame so this model is built on the
# tweets exactly as provided, without any text preprocessing.
unprocessed_data = pd.read_csv('train.csv')

In [0]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the raw tweets for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    unprocessed_data["tweet"],
    unprocessed_data["label"],
    test_size=0.2,
    random_state=42,
)

In [0]:
# Sequentialization of tasks
from sklearn.pipeline import Pipeline

#generating ngrams and tokens and Bagging
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier

# Baseline text classifier: word counts (English stop words removed) -> TF-IDF weighting
# -> random forest with 50 trees.
# random_state seeds the forest so the metrics reported below are the same on every run.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=50, random_state=42)),
])

In [0]:
model = text_clf.fit(X_train,y_train)

In [0]:
predicted = model.predict(X_test)

In [0]:
from sklearn.metrics import precision_score,recall_score,f1_score, accuracy_score, confusion_matrix

In [0]:
confusion_matrix(y_test,predicted)

In [0]:
accuracy_score(y_test,predicted)

In [0]:
precision_score(y_test,predicted)

In [0]:
recall_score(y_test,predicted)

In [0]:
f1_score(y_test,predicted)

# 04 and 05 : Feature Engineering and Model Selection (2nd Iteration)

Preprocessing is a crucial step in text analysis. Two of the most important steps are tokenization and feature extraction (vectorization), both of which scikit-learn supports.
The text must first be parsed into words, which is called tokenization. The words are then encoded as integers or floating-point values so they can be used as input to a machine learning algorithm, which is called feature extraction (or vectorization).


In [0]:
#regular expression 
import re 

#Cleans a single tweet: removes @mentions and every non-alphanumeric character.
def process_tweet(tweet):
    """Return a lower-cased, cleaned copy of ``tweet``.

    Processing steps:
      1. lower-case the text;
      2. replace every @mention with a space (letters, digits and underscores
         after the ``@`` all belong to the handle);
      3. replace any character other than ASCII letters, digits, space or tab
         with a space, which drops punctuation, ``#`` signs and emoticons;
      4. collapse runs of whitespace and strip the ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text with words separated by single spaces.
    """
    # The underscore is part of the handle pattern; without it "@user_name" would be
    # cut at "@user" and the trailing "name" would leak into the cleaned text.
    return " ".join(re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])", " ", tweet.lower()).split())

In [0]:
#Removes the named columns from a DataFrame
def drop_features(features,data):
    """Delete the columns listed in ``features`` from ``data``.

    The frame is modified in place; nothing is returned.
    """
    data.drop(columns=features, inplace=True)

In [0]:
#Applying the Process_tweet function to the given Train Data
train['processed_tweets'] = train['tweet'].apply(process_tweet)

In [0]:
train.head()

In [0]:
train[train['label']==1].head(20)

In [0]:
# Drop the columns that are no longer needed. Only columns still present are passed on,
# so re-running this cell after they are gone does not raise a KeyError.
drop_features([col for col in ['id','tweet'] if col in train.columns],train)

In [0]:
train.head()

In [0]:
# 80/20 hold-out split of the cleaned tweets, seeded so it can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    train["processed_tweets"],
    train["label"],
    test_size=0.2,
    random_state=42,
)

Pipeline: sequentially applies a list of transforms followed by a final estimator. The intermediate steps of the pipeline must be "transforms", that is, they must implement the `fit` and `transform` methods. The final estimator only needs to implement `fit`.

In [0]:
# Same count -> TF-IDF -> random forest pipeline as the baseline, now with 200 trees
# and trained on the cleaned tweets.
# random_state seeds the forest so the metrics reported below are the same on every run.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
])
# Pipeline.fit returns the fitted pipeline itself, so `text` and `text_clf` are the same object.
text = text_clf.fit(x_train,y_train)

In [0]:
predicted = text.predict(x_test)

In [0]:
from sklearn.metrics import confusion_matrix, classification_report,precision_score

In [0]:
cm_m = confusion_matrix(y_test,predicted)
cm_m

In [0]:
# Flatten the 2x2 confusion matrix row by row:
# row 0 holds (true negatives, false positives), row 1 holds (false negatives, true positives).
TN, FP, FN, TP = cm_m.ravel()

In [0]:
TP

In [0]:
float(TN+TP)/(TN+TP+FN+FP)

In [0]:
p = TP/(TP+FP)
p

In [0]:
precision_score(y_test,predicted)

In [0]:
r = TP/(FN+TP)
r

In [0]:
recall_score(y_test,predicted)

In [0]:
f1 = 2*p*r/(p+r)
f1

In [0]:
f1_score(y_test,predicted)

# 04 and 05 : Feature Engineering and Model Selection (3rd Iteration)

In [0]:

# Count word n-grams from unigrams up to trigrams, with English stop words removed.
count_vect = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
)
# TF-IDF weighting with sublinear term-frequency scaling and L2-normalised rows.
transformer = TfidfTransformer(sublinear_tf=True, norm='l2')

In [0]:
# Seeded 80/20 hold-out split of the cleaned tweets.
x_train, x_test, y_train, y_test = train_test_split(
    train["processed_tweets"],
    train["label"],
    test_size=0.2,
    random_state=42,
)

# The vectorizer and the TF-IDF transformer are fitted on the training tweets only;
# the held-out tweets are merely transformed with what was learned there.
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)
x_train_tfidf = transformer.fit_transform(x_train_counts)
x_test_tfidf = transformer.transform(x_test_counts)

In [0]:
print(x_train_counts.shape)
print(x_train_tfidf.shape)
print(x_test_counts.shape)
print(x_test_tfidf.shape)

In [0]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the TF-IDF features (modified Huber loss, L1 penalty).
# random_state seeds the optimizer's data shuffling so the scores below are reproducible.
model = SGDClassifier(loss="modified_huber", penalty="l1", random_state=42)
model.fit(x_train_tfidf,y_train)
predictions = model.predict(x_test_tfidf)

In [0]:
f1_score(y_test,predictions)

In [0]:
recall_score(y_test,predictions)

In [0]:
precision_score(y_test,predictions)

In [0]:
f1_score(y_test,predictions)

# 05 : Model Selection

In [0]:
# Try a different classification model: a linear SVM.
from sklearn.svm import LinearSVC

# random_state makes the solver's data shuffling, and therefore the scores below, reproducible.
model_svc = LinearSVC(C=2.0,max_iter=500,tol=0.0001,loss ='hinge',random_state=42)
# NOTE(review): this model is fitted on the raw n-gram counts, not on the TF-IDF matrix
# built above - confirm that is intended.
model_svc.fit(x_train_counts,y_train)

In [0]:
predict_svc = model_svc.predict(x_test_counts)

In [0]:
f1_score(y_test,predict_svc)

In [0]:
recall_score(y_test,predict_svc)

# 06 : Tune the Model

In [0]:

#optimizing parameters
from sklearn.model_selection import GridSearchCV


# Search jointly over the n-gram range of the vectorizer and the SVM regularisation strength C.
params = {"tfidf__ngram_range": [(1, 2), (1,3), (1,4)],
          "svc__C": [.01, .1, 1, 10, 100]}

# random_state keeps the SVM solver, and therefore the search result, reproducible.
clf = Pipeline([("tfidf", TfidfVectorizer(sublinear_tf=True)),
                ("svc", LinearSVC(loss='hinge', random_state=42))])

# Rank candidates by F1 of the positive class, the metric this notebook reports for
# every model. Without `scoring`, GridSearchCV falls back to the estimator's own
# score (accuracy for a classifier). "Best Score" below is therefore the mean
# cross-validated F1.
gs = GridSearchCV(clf, params, scoring="f1", verbose=4, n_jobs=-1)
gs.fit(x_train,y_train)
print("Best Estimator = ", gs.best_estimator_)
print("Best Score = ",gs.best_score_)

In [0]:
predicted = gs.predict(x_test)


In [0]:
predicted

In [0]:
f1_score(y_test,predicted)

In [0]:
recall_score(y_test,predicted)

In [0]:
precision_score(y_test,predicted)

# 07 : Predict on new cases

In [0]:
!wget https://www.dropbox.com/s/as2y6lpjsh6284l/test.csv

In [0]:
# Load the test CSV fetched above (the tweets to predict on) and print its
# column / dtype / non-null summary.
submission = pd.read_csv('test.csv')
submission.info()

In [0]:
submission['processed_tweet'] = submission['tweet'].apply(process_tweet)

In [0]:
submission.head()

In [0]:
drop_features(['tweet'],submission)

In [0]:
submission.head()

In [0]:
predicted = gs.predict(submission['processed_tweet'])

In [0]:
predicted

In [0]:
# Build the submission table: one row per test tweet with its id and predicted label.
final_predict = pd.DataFrame(predicted,columns=['label'])
# Pair ids and labels by position (plain arrays) rather than by index alignment, so the
# rows stay matched even if `submission` does not carry a default 0..n-1 index.
result = pd.DataFrame({'id': submission['id'].values,
                       'label': final_predict['label'].values})
result.to_csv('final_predictions.csv',index=False)

In [0]:
result['label'].value_counts()