In [25]:
import pandas as pd
tweets = pd.read_csv('Samsung_final_brand_cleaned_subset_annotated_complete_finl.csv', sep=',')
tweets.head()
#display(len(data))

Unnamed: 0,id,text,polarity,polarity_confidence,subjectivity,subjectivity_confidence
0,930830962757455000,Any phone with a lot of gb and available for a...,positive,0.578184127807617,subjective,1
1,977958263269474000,@ SamsungMobileIN @ DoT_India New Samsung OS u...,negative,0.937839031219482,subjective,1
2,968617998096457000,Samsung? No they aren't and any thoughts of th...,negative,0.897110342979431,subjective,1
3,978014137262465000,I didn’t mean that to neglect the people on bu...,neutral,0.43565747141838,subjective,1
4,915900425408270000,@ DocThompsonShow @ TheBlazeKeith # WhatILearn...,positive,0.799633145332336,subjective,1


In [26]:
tweets.polarity.value_counts()

neutral     5581
negative    3506
positive    3320
polarity       3
Name: polarity, dtype: int64

In [28]:
tweets.drop(['id','polarity_confidence','subjectivity_confidence','subjectivity'],axis=1,inplace=True)

In [9]:
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import codecs
import csv
import nltk
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

with codecs.open('sentiment_samsung_preprocessed.csv', 'w', 'utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['id', 'text','polarity'])
    for i in range(len(tweets)):
        text = (tweets['text'].astype(str))[i].lower()

        textSplit = [word for word in word_tokenize(text) if word not in set(stopwords.words('english'))]
        noPunc = [word for word in str(textSplit) if word not in string.punctuation]
        tokens = ''.join(noPunc)
        #alpha = re.sub("[^a-zA-Z]", ' ',str(textSplit))
        wordStem = [word for word in lem.lemmatize(tokens)]
        wordString = ''.join(wordStem)
        writer.writerow([tweets.id[i], wordString,tweets.polarity[i]]) 

In [47]:
tweets_preprocessed = pd.read_csv('sentiment_samsung_preprocessed.csv')
tweets_preprocessed.head(3)

Unnamed: 0,id,text,polarity
0,930830962757455000,phone lot gb available carrier prefer samsung...,positive
1,977958263269474000,samsungmobilein dotindia new samsung os upgr...,negative
2,968617998096457000,samsung nt thoughts happening either pure spe...,negative


In [48]:
for i in range(0, len(tweets_preprocessed)):
    if  tweets_preprocessed.loc[i]['polarity'] == 'positive':
        tweets_preprocessed.at[i, 'sentiment'] = 1
    elif tweets_preprocessed.loc[i]['polarity'] == 'negative':
        tweets_preprocessed.at[i, 'sentiment'] = -1
    else:
        tweets_preprocessed.at[i, 'sentiment'] = 0

In [60]:
tweets_preprocessed.head()

Unnamed: 0,id,text,polarity,sentiment
0,930830962757455000,phone lot gb available carrier prefer samsung...,positive,1.0
1,977958263269474000,samsungmobilein dotindia new samsung os upgr...,negative,-1.0
2,968617998096457000,samsung nt thoughts happening either pure spe...,negative,-1.0
3,978014137262465000,’ mean neglect people referring innovation ke...,neutral,0.0
4,915900425408270000,docthompsonshow theblazekeith whatilearnedt...,positive,1.0


In [85]:
tweets_preprocessed_posNeg =tweets_preprocessed[(tweets_preprocessed.sentiment == 1.0) | (tweets_preprocessed.sentiment == -1.0)]

In [86]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [87]:
train, test = train_test_split(tweets_preprocessed_posNeg, test_size=0.2, random_state=1)
X_train = train['text'].values
X_test = test['text'].values
y_train = train['sentiment']
y_test = test['sentiment']

In [88]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = True,
    ngram_range=(1, 1))

In [89]:
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [90]:
import numpy as np
np.random.seed(1)

pipeline_svm = make_pipeline(vectorizer, 
                            SVC(probability=True, kernel="linear", class_weight="balanced"))

grid_svm = GridSearchCV(pipeline_svm,
                    param_grid = {'svc__C': [0.01, 0.1, 1]}, 
                    cv = kfolds,
                    scoring="roc_auc",
                    verbose=1,   
                    n_jobs=-1) 

grid_svm.fit(X_train, y_train)
grid_svm.score(X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.8min finished


0.8791049705746101

In [68]:
grid_svm.best_params_

{'svc__C': 0.1}

In [69]:
grid_svm.best_score_

0.8829776512982763

In [70]:
def report_results(model, X, y):
    pred_proba = model.predict_proba(X)[:, 1]
    pred = model.predict(X)        

    auc = roc_auc_score(y, pred_proba)
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred)
    prec = precision_score(y, pred)
    rec = recall_score(y, pred)
    result = {'auc': auc, 'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
    return result

In [71]:
report_results(grid_svm.best_estimator_, X_test, y_test)

{'auc': 0.8790985364514524,
 'f1': 0.7962962962962963,
 'acc': 0.8067349926793558,
 'precision': 0.821656050955414,
 'recall': 0.7724550898203593}

In [103]:
grid_svm.predict(["with samsung is always a great experience"])

array([1.])

In [104]:
grid_svm.predict(["worst experience"])

array([-1.])

In [107]:
grid_svm.predict_proba(["Any phone with a lot of gb and available for any carrier, but I prefer Samsung! I know you will do the best of the best in whatever you choose"])[-1][1]

0.8982222090632392