In [26]:
#from cleanTweet import cleaningTweet  ---> OLD
from lemmatizationText import computeLemmatizationText

import pandas as pd
import numpy as np
#-------------------------- processing ------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# hyper-parameters tuning
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
tweetTesting = "\U0001F633 Apple 2021 @Apple “@Luigi “@Market update, we ’ got her back, so ’ ’ going to try and help her understand that she can go out, come back and be safe. https://t.co/Uyef7y0gxf"

cleanedTweet = cleaningTweet(tweetTesting)

In [3]:
print(cleanedTweet)

apple update we get her back so go to try and help her understand that she can go out come back and be safe


In [9]:
#dataset = pd.read_csv("./sample.csv")
#dataset.dropna(subset=["text","tag"], inplace=True)

dataset = pd.read_csv("./apple_tweets_for_modelling")
dataset.dropna(subset=["Tweet Text","tag"], inplace=True)

In [10]:
dataset.head()

Unnamed: 0,index,Tweet Datetime,Tweet Text,tag
0,1,2021-08-01 05:59:15,And still it too expensive and most people won...,negative
1,2,2021-08-01 06:22:23,Apple unpacked best phone the iPhone12 z flip,positive
2,3,2021-08-01 09:00:30,especially because you can trick ur Apple out ...,neutral
3,4,2021-08-01 09:43:12,going to jump ship to iOSthat new Apple has a ...,positive
4,5,2021-08-01 10:46:17,just took off my phone screen protector that c...,positive


In [11]:
# label coding 
tag_codes = {
    "positive" : 1, 
    "negative" : 0,
    "neutral" : -1
}

# category mapping
dataset["tag_code"] = dataset["tag"]
dataset = dataset.replace({"tag_code" :tag_codes})

# labels set    
labels = dataset["tag_code"]

In [27]:
lemmatized_set = []
column_name = "Tweet Text"

lemmatized_set = computeLemmatizationText(dataset,column_name)


print(lemmatized_set[0])

print(lemmatized_set[1])

print(lemmatized_set[2])

and still it too expensive and most people wont ever use that big screen for the productivity apple be talk about they only mention that to make us imagine what incredible thing wed be able to do if we buy that phone
apple unpacked best phone the iphone z flip
especially because you can trick ur apple out and the color be pretty plus so many cool featuresit really your device


In [13]:
lemmatized_set = []
key_name = "Tweet Text"
for row_index in dataset.index:
    row_field = dataset.loc[row_index, key_name]

    filtered_tokens = cleaningTweet(row_field)

    # lemmatization
    lemmatized_set.append(filtered_tokens)

print(lemmatized_set[0])

print(lemmatized_set[1])

print(lemmatized_set[2])

and still it too expensive and most people wont ever use that big screen for the productivity apple be talk about they only mention that to make us imagine what incredible thing wed be able to do if we buy that phone
apple unpacked best phone the iphone z flip
especially because you can trick ur apple out and the color be pretty plus so many cool featuresit really your device


In [14]:
for i in range(0,30):
    print(lemmatized_set[i])

and still it too expensive and most people wont ever use that big screen for the productivity apple be talk about they only mention that to make us imagine what incredible thing wed be able to do if we buy that phone
apple unpacked best phone the iphone z flip
especially because you can trick ur apple out and the color be pretty plus so many cool featuresit really your device
go to jump ship to iosthat new apple have a megapixel camera and i need it
just take off my phone screen protector that come preinstalled by apple and it turn out the glass i think have chip be actually just screen protector damage i think i be go to have to pay £ for a new screen thank you jesus thank you apple
would anyone want to purchase a apple k screen before i toss it on the public scam place absolutely zero issue buy it roughly a year ago im try to upgrade be all wife need some motivation i have ton of screenshots if need
yeah so all you need be a new phone make by a different company like apple
wait why b

In [19]:
#Processing
num_features = 500

# Bag Of Words - uni-grams 
BOW_uni_svm_pipe = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('fselect', SelectKBest(chi2, k=num_features)),
        ('clf', svm.SVC()),
        ])

scoring = {'accuracy' : make_scorer(accuracy_score), 
            'precision' : make_scorer(precision_score,average='micro',labels=labels,zero_division=True),
            'recall' : make_scorer(recall_score,average='micro',labels=labels,zero_division=True), 
            'f1_score' : make_scorer(f1_score,average='micro',labels=labels,zero_division=True)}


In [20]:
# store results
results = []
results.append(cross_validate(estimator= BOW_uni_svm_pipe,
                    X=lemmatized_set,
                    y=labels,
                    cv=10,
                    scoring=scoring
                    )) 

In [21]:
#append result
tmp_svm = {}
tmp_svm["Num Features"] = num_features
tmp_svm["BOW uni-grams Accuracy"] = np.mean(results[0]['test_accuracy'])
tmp_svm["BOW uni-grams Precision"] = np.mean(results[0]['test_precision'])
tmp_svm["BOW uni-grams Recall"] = np.mean(results[0]['test_recall'])
tmp_svm["BOW uni-grams F1 Score"] = np.mean(results[0]['test_f1_score'])

print(results[0]['test_accuracy'])
print(results[0]['test_precision'])
print(results[0]['test_recall'])
print(results[0]['test_f1_score'])
print(tmp_svm)

[0.58798283 0.64806867 0.68240343 0.69957082 0.74678112 0.71244635
 0.7639485  0.72532189 0.76724138 0.69396552]
[0.59151761 0.65405867 0.68702646 0.70508679 0.74745081 0.71859509
 0.76606053 0.72793478 0.77293857 0.69609012]
[0.58246167 0.64192578 0.68261929 0.69529468 0.74768159 0.7113368
 0.76370373 0.72856087 0.76760766 0.69159748]
[0.58695471 0.64793543 0.68481578 0.7001565  0.74756618 0.71494752
 0.76488031 0.72824769 0.77026389 0.69383653]
{'Num Features': 500, 'BOW uni-grams Accuracy': 0.7027730501701939, 'BOW uni-grams Precision': 0.7066759412405722, 'BOW uni-grams Recall': 0.701278953077531, 'BOW uni-grams F1 Score': 0.7039604538457571}
