# Import tweets datasets into pandas dataframe

In [19]:
#Final crawled dataset
import pandas as pd
tweets = pd.read_csv('final_samsung_crawled.csv', sep= ',')
tweets.head()
#len(tweets) #45782

Unnamed: 0,id,date,text,permalink
0,925495649730117632,2017-10-31 23:52,# phone gear Anti Gravity Casing for iPhone an...,https://twitter.com/MyPhoneMyWorld/status/9254...
1,925490587402465280,2017-10-31 23:32,Went to the cell phone repair place and they s...,https://twitter.com/SinamonLance/status/925490...
2,925483268715184128,2017-10-31 23:03,Like and Share if you want this Haunted Mansio...,https://twitter.com/Siresayshop/status/9254832...
3,925476232380145664,2017-10-31 22:35,"Great uses for ""defunct"" phones. The ""reuse"" p...",https://twitter.com/cosmo_nz/status/9254762323...
4,925473978625548289,2017-10-31 22:26,Samsung Galaxy A5 2018 Full Phone Specificatio...,https://twitter.com/entclassblog/status/925473...


In [56]:
#Final annotated dataset from annotation.ipynb
tweets_annoated = pd.read_csv('samsung_annotated_40k.csv') # file should be changed
tweets_annoated.head()

Unnamed: 0,id,text,polarity,polarity_confidence,subjectivity,subjectivity_confidence
0,925495649730117632,# phone gear Anti Gravity Casing for iPhone an...,neutral,0.972387,objective,0.999999
1,925490587402465280,Went to the cell phone repair place and they s...,negative,0.78567,subjective,1.0
2,925483268715184128,Like and Share if you want this Haunted Mansio...,neutral,0.914202,subjective,1.0
3,925476232380145664,"Great uses for ""defunct"" phones. The ""reuse"" p...",neutral,0.664289,objective,0.999453
4,925473978625548289,Samsung Galaxy A5 2018 Full Phone Specificatio...,neutral,0.865111,objective,1.0


# additional features 'polarityNum' and 'brand'

In [57]:
for i in range(0, len(tweets_annoated)):
    if  tweets_annoated.loc[i]['polarity'] == 'positive':
        tweets_annoated.at[i, 'polarityNum'] = 1
    elif tweets_annoated.loc[i]['polarity'] == 'negative':
        tweets_annoated.at[i, 'polarityNum'] = -1
    else:
        tweets_annoated.at[i, 'polarityNum'] = 0

In [58]:
tweets_annoated['brand'] = 'Samsung'

In [59]:
tweets_annoated.head(3)

Unnamed: 0,id,text,polarity,polarity_confidence,subjectivity,subjectivity_confidence,polarityNum,brand
0,925495649730117632,# phone gear Anti Gravity Casing for iPhone an...,neutral,0.972387,objective,0.999999,0.0,Samsung
1,925490587402465280,Went to the cell phone repair place and they s...,negative,0.78567,subjective,1.0,-1.0,Samsung
2,925483268715184128,Like and Share if you want this Haunted Mansio...,neutral,0.914202,subjective,1.0,0.0,Samsung


# Balance the dataset to mininum polarity value

In [60]:
tweets_annoated.polarity.value_counts()

neutral     18696
negative    11670
positive    11098
Name: polarity, dtype: int64

In [61]:
g = tweets_annoated.groupby('polarity')
tweets_balanced = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))

In [62]:
tweets_balanced.polarity.value_counts()

positive    11098
negative    11098
neutral     11098
Name: polarity, dtype: int64

In [63]:
tweets_balanced.to_csv('tweets_balanced_samsung.csv', index =False)

In [64]:
tweets_balanced_samsung = pd.read_csv('tweets_balanced_samsung.csv')
tweets_balanced_samsung.head(3)

Unnamed: 0,id,text,polarity,polarity_confidence,subjectivity,subjectivity_confidence,polarityNum,brand
0,971714841315438593,Oh yeah I assumed. For one thing no one could ...,negative,0.998388,subjective,1.0,-1.0,Samsung
1,920490436904607746,But out of the box are Samsung Phone OLED scre...,negative,0.501652,subjective,0.995007,-1.0,Samsung
2,979000635222872064,What Huawei does with some of its phones of la...,negative,0.957417,subjective,1.0,-1.0,Samsung


# Data Preprocessing
# (lower case, word tokenize,stopword removal,punctuation removal, lemmatization)

In [65]:
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import codecs
import csv
import nltk
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

with codecs.open('samsung_balanced_11098_preprocessed.csv', 'w', 'utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['id', 'text_preprocessed','polarity','polarity_confidence','subjectivity','subjectivity_confidence','polarityNum', 'brand'])
    for i in range(len(tweets_balanced_samsung)):
        text = (tweets_balanced_samsung['text'].astype(str))[i].lower()

        stopwordSet = set(stopwords.words('english'))
        textSplit = [word for word in word_tokenize(text) if word not in stopwordSet]
        noPunc = [word for word in str(textSplit) if word not in string.punctuation]
        words = ''.join(noPunc)
        wordStem = [lemmatizer.lemmatize(word) for word in word_tokenize(words)]
        alphaOnly = [word for word in wordStem if word.isalpha()]
        wordstring = ' '.join(alphaOnly)
        writer.writerow([tweets_balanced_samsung.id[i], wordstring,tweets_balanced_samsung.polarity[i],tweets_balanced_samsung.polarity_confidence[i],tweets_balanced_samsung.subjectivity[i],tweets_balanced_samsung.subjectivity_confidence[i],tweets_balanced_samsung.polarityNum[i],tweets_balanced_samsung.brand[i]]) 

In [66]:
samsung_balanced_preprocessed = pd.read_csv('samsung_balanced_11098_preprocessed.csv')
samsung_balanced_preprocessed.head()

Unnamed: 0,id,text_preprocessed,polarity,polarity_confidence,subjectivity,subjectivity_confidence,polarityNum,brand
0,971714841315438593,oh yeah assumed one thing one could mean merce...,negative,0.998388,subjective,1.0,-1.0,Samsung
1,920490436904607746,box samsung phone oled screen great saturated ...,negative,0.501652,subjective,0.995007,-1.0,Samsung
2,979000635222872064,huawei phone late crazy big brand samsung appl...,negative,0.957417,subjective,1.0,-1.0,Samsung
3,957436019636363264,scale samsung horrible make phone run worse id...,negative,0.99996,subjective,1.0,-1.0,Samsung
4,935117316827750401,m using samsung get stuck time nt mind new raz...,negative,0.798083,subjective,1.0,-1.0,Samsung


# Merge original crawled dataset and preprocssed to get all features

In [67]:
merged_samsung = pd.merge( tweets,samsung_balanced_preprocessed, on='id', how='inner')
merged_samsung.head()

Unnamed: 0,id,date,text,permalink,text_preprocessed,polarity,polarity_confidence,subjectivity,subjectivity_confidence,polarityNum,brand
0,925495649730117632,2017-10-31 23:52,# phone gear Anti Gravity Casing for iPhone an...,https://twitter.com/MyPhoneMyWorld/status/9254...,phone gear anti gravity casing iphone samsung ...,neutral,0.972387,objective,0.999999,0.0,Samsung
1,925490587402465280,2017-10-31 23:32,Went to the cell phone repair place and they s...,https://twitter.com/SinamonLance/status/925490...,went cell phone repair place said d need conta...,negative,0.78567,subjective,1.0,-1.0,Samsung
2,925476232380145664,2017-10-31 22:35,"Great uses for ""defunct"" phones. The ""reuse"" p...",https://twitter.com/cosmo_nz/status/9254762323...,great us defunct phone reuse part reuse reduce...,neutral,0.664289,objective,0.999453,0.0,Samsung
3,925472949150380033,2017-10-31 22:22,"damn, and with Apple breaking their iOS, samsu...",https://twitter.com/mcdroidgame/status/9254729...,damn apple breaking io samsung overloading cra...,negative,0.494093,subjective,0.99985,-1.0,Samsung
4,925472766480076801,2017-10-31 22:21,"rachel: ""theres not someone who owns some appl...",https://twitter.com/cmbxsmile/status/925472766...,rachel there someone owns apple product apple ...,neutral,0.769667,subjective,1.0,0.0,Samsung


In [68]:
merged_samsung.to_csv('samsung_final_14may.csv',index = False)

In [69]:
tweets_samsung = pd.read_csv('samsung_final_14may.csv')
tweets_samsung.head(3)

Unnamed: 0,id,date,text_original,permalink,brand,text_preprocessed,polarity,polarity_confidence,subjectivity,subjectivity_confidence,polarityNum,Checked
0,925495649730117632,2017-10-31 23:52,# phone gear Anti Gravity Casing for iPhone an...,https://twitter.com/MyPhoneMyWorld/status/9254...,Samsung,phone gear anti gravity casing iphone samsung ...,neutral,0.972387,objective,0.999999,0.0,1
1,925490587402465280,2017-10-31 23:32,Went to the cell phone repair place and they s...,https://twitter.com/SinamonLance/status/925490...,Samsung,went cell phone repair place said d need conta...,negative,0.78567,subjective,1.0,-1.0,1
2,925476232380145664,2017-10-31 22:35,"Great uses for ""defunct"" phones. The ""reuse"" p...",https://twitter.com/cosmo_nz/status/9254762323...,Samsung,great us defunct phone reuse part reuse reduce...,neutral,0.664289,objective,0.999453,0.0,1


In [71]:
#same steps are followed for iphone to get the iphone final dataset
tweets_iphone = pd.read_csv('iphone_final_14may.csv')
tweets_iphone.head(3)

Unnamed: 0,id,date,text_original,permalink,text_preprocessed,polarity,polarity_confidence,subjectivity,subjectivity_confidence,polarityNum,brand
0,948570515085713408,2018-01-03 16:03,It's 10 AM and my iPhone 7 battery is already ...,https://twitter.com/TheScottBeach/status/94857...,s iphone battery already minute phone call tol...,negative,0.894631,subjective,1.0,-1.0,iphone
1,945970348436094977,2017-12-27 11:51,Apple launched three phones this year: the bez...,https://twitter.com/Today__Tech/status/9459703...,apple launched three phone year bezelbusting i...,positive,0.492551,subjective,1.0,1.0,iphone
2,951907048035270656,2018-01-12 21:01,Cut one phone completely OFF and my iPhone on ...,https://twitter.com/gods1blessings/status/9519...,cut one phone completely iphone dnd yeah ready...,positive,0.355765,subjective,1.0,1.0,iphone


# Merging of samsung and iphone dataset to get Combined dataset 

In [78]:
combined_final = tweets_samsung.append(tweets_iphone)
combined_final.to_csv('combined_final_14may.csv',index = False)

In [80]:
tweets_combined = pd.read_csv('combined_final_14may.csv')
tweets_combined.head(3)

Unnamed: 0,Checked,brand,date,id,permalink,polarity,polarityNum,polarity_confidence,subjectivity,subjectivity_confidence,text_original,text_preprocessed
0,1.0,Samsung,2017-10-31 23:52,925495649730117632,https://twitter.com/MyPhoneMyWorld/status/9254...,neutral,0.0,0.972387,objective,0.999999,# phone gear Anti Gravity Casing for iPhone an...,phone gear anti gravity casing iphone samsung ...
1,1.0,Samsung,2017-10-31 23:32,925490587402465280,https://twitter.com/SinamonLance/status/925490...,negative,-1.0,0.78567,subjective,1.0,Went to the cell phone repair place and they s...,went cell phone repair place said d need conta...
2,1.0,Samsung,2017-10-31 22:35,925476232380145664,https://twitter.com/cosmo_nz/status/9254762323...,neutral,0.0,0.664289,objective,0.999453,"Great uses for ""defunct"" phones. The ""reuse"" p...",great us defunct phone reuse part reuse reduce...
