In [1]:
import pandas as pd
import numpy as np  
import re 
import string

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [4]:
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/theDoctor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [6]:
def load_dataset(filepath):
    df = pd.read_csv(filepath, encoding='latin-1')
    df.columns = ["text","sentiment"]
    return df

In [7]:

def load_text_data(filename, append = False):
    if not append:
        data = []
        
    textfile = open(filename, "r")
    for tweet in textfile:
        parts = tweet.split(',')
        if len(parts) > 2:
            newTweet = [parts[0],parts[1], 0, "".join(parts[3:])]
            data.append(newTweet)
    textfile.close()
    df = pd.DataFrame(data)
    df.columns = ["User", "Date", "Retweet", "text"]
    return df


In [8]:
def delete_redundant_cols(df, cols):
    for col in cols:
        del df[col]
        return df

In [15]:
def preprocess_tweet_text(tweet):

    tweet = tweet.lower()

    tweet = re.sub(r"http\S+|www\S+|https\S+","",tweet, flags=re.MULTILINE)   

    tweet = tweet.translate(str.maketrans("","",string.punctuation))

    tweet = re.sub(r'\@\w+|\#',"",tweet)

    tweet_tokens = word_tokenize(tweet)
    filtered_words = [word for word in tweet_tokens if word not in stop_words]

#     ps = PorterStemmer()
#     stemmed_words = [ps.stem(w) for w in filtered_words]

    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in filtered_words]

    return " ".join(lemma_words)
preprocess_tweet_text("Hi there, how are you preparing for your exams?")    

'hi preparing exams'

In [10]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [11]:
def int_to_string(sentiment):
    if sentiment < -0.1:
        return "Negative"
    elif sentiment < 0.1:
        return "Neutral"
    else:
        return "Positive"

In [59]:
# Load dataset
dataset = load_text_data("datasets/dataset 2021090613_29_33.txt")
# Remove unwanted columns from dataset
# n_dataset = remove_unwanted_cols(dataset, ['t_id', 'created_at', 'query', 'user'])
#Preprocess data
# dataset["cleanText"] = dataset['text']
dataset["cleanText"] = dataset['text'].apply(preprocess_tweet_text)




# determine the sentiment of the tweet using TextBlob and use that result to train the model
sent = []
for x in dataset['cleanText']:
    
    
    
    blob_object = TextBlob(x, analyzer=NaiveBayesAnalyzer())
    
#    Sentiment(classification='pos', p_pos=0.5057908299783777, p_neg=0.49420917002162196)
    
    
    analysis = blob_object.sentiment
    
    c = analysis.classification
    p = analysis.p_pos
    n = analysis.p_neg
    
    print(analysis)
    
    if p < 0.4:
        r = -1
    elif p > 0.6:
        r = 1
    else:
        r = 0
    sent.append(r) 
    
#create the sentiment column
dataset["sentiment"] = sent

Sentiment(classification='neg', p_pos=0.061848992071306515, p_neg=0.9381510079286948)
Sentiment(classification='pos', p_pos=0.8324124318829695, p_neg=0.1675875681170304)
Sentiment(classification='neg', p_pos=0.3947368421052628, p_neg=0.6052631578947371)
Sentiment(classification='pos', p_pos=0.6571729015199701, p_neg=0.34282709848003085)
Sentiment(classification='pos', p_pos=0.697496358192723, p_neg=0.30250364180727796)
Sentiment(classification='pos', p_pos=0.84646178324069, p_neg=0.1535382167593091)
Sentiment(classification='neg', p_pos=0.46076328060400257, p_neg=0.5392367193959956)
Sentiment(classification='pos', p_pos=0.5387783635019395, p_neg=0.4612216364980616)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.8832313000416809, p_neg=0.11676869995831875)
Sentiment(classification='pos', p_pos=0.5953133228014746, p_neg=0.4046866771985255)
Sentiment(classification='pos', p_pos=0.7366632825412875, p_neg=0.26333671745871584)
Sentiment(classifi

In [78]:
# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(dataset['cleanText']).ravel())
X = tf_vector.transform(np.array(dataset['cleanText']).ravel())
y = np.array(dataset['sentiment']).ravel()
# y = y.astype('float')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.45454545454545453
0.45454545454545453


In [44]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   User       52 non-null     object
 1   Date       52 non-null     object
 2   Retweet    52 non-null     int64 
 3   text       52 non-null     object
 4   cleanText  52 non-null     object
 5   sentiment  52 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 2.6+ KB


In [86]:
dataset[dataset['sentiment'] < 0]

Unnamed: 0,User,Date,Retweet,text,cleanText,sentiment
0,emojizedcom,Mon Sep 06 18:29:15 +0000 2021,0,Monotonectally Mesh Intuitive Manufactured #P...,Monotonectally Mesh Intuitive Manufactured #P...,-1
2,CurtisSChin,Mon Sep 06 18:28:52 +0000 2021,0,#USA 🇺🇸\n,#USA 🇺🇸\n,-1
24,RLTraveler,Mon Sep 06 18:16:19 +0000 2021,0,Masked? 🤔😷 \n,Masked? 🤔😷 \n,-1
29,clayandbuck,Mon Sep 06 18:14:27 +0000 2021,0,"""What does the #Biden administration have to ...","""What does the #Biden administration have to ...",-1
30,DickMorrisTweet,Mon Sep 06 18:13:31 +0000 2021,0,People Think Better Of Trump As They Get To K...,People Think Better Of Trump As They Get To K...,-1
31,wessas68,Mon Sep 06 18:12:40 +0000 2021,0,@ShaniaTwain I’ll be there if #biden let’s us...,@ShaniaTwain I’ll be there if #biden let’s us...,-1
38,God4HopeFaith,Mon Sep 06 18:08:50 +0000 2021,0,@ThatOldPimp @chrislhayes #AfghanistanCrisis ...,@ThatOldPimp @chrislhayes #AfghanistanCrisis ...,-1
39,leylaboulton,Mon Sep 06 18:04:40 +0000 2021,0,Let’s hope the #Biden administration can do m...,Let’s hope the #Biden administration can do m...,-1
42,Zoetnet,Mon Sep 06 18:01:44 +0000 2021,0,FLASHBACK: 4 Years Ago #Trump Warned Against ...,FLASHBACK: 4 Years Ago #Trump Warned Against ...,-1
46,suiwazear,Mon Sep 06 17:59:23 +0000 2021,0,#Fauci #biden #who #unicef #cdc #aphq #carter...,#Fauci #biden #who #unicef #cdc #aphq #carter...,-1


In [87]:
dataset[dataset['sentiment'] > 0]

Unnamed: 0,User,Date,Retweet,text,cleanText,sentiment
1,global_police,Mon Sep 06 18:29:09 +0000 2021,0,@TheEconomist Good it's such a wonderful time...,@TheEconomist Good it's such a wonderful time...,1
3,patriotlady76,Mon Sep 06 18:27:45 +0000 2021,0,#State department is stopping flights for Ame...,#State department is stopping flights for Ame...,1
4,two001snake,Mon Sep 06 18:26:37 +0000 2021,0,ALL #Corrupt @USCongress #FakeNews #MSM #TERR...,ALL #Corrupt @USCongress #FakeNews #MSM #TERR...,1
5,JoanneSpruceC21,Mon Sep 06 18:25:16 +0000 2021,0,"""Biden directs federal aid to NY NJ after dea...","""Biden directs federal aid to NY NJ after dea...",1
9,shepersists2,Mon Sep 06 18:23:54 +0000 2021,0,I am anti war but the way Americans left Afgh...,I am anti war but the way Americans left Afgh...,1
11,BoesenA,Mon Sep 06 18:22:50 +0000 2021,0,Americans held hostage at Afghan Airport as R...,Americans held hostage at Afghan Airport as R...,1
14,JamesLiskutin,Mon Sep 06 18:21:57 +0000 2021,0,'Traditional' Muslim rules enforced after Tal...,'Traditional' Muslim rules enforced after Tal...,1
16,canine2,Mon Sep 06 18:20:44 +0000 2021,0,We didn’t enter #Afghanistan for a 20 year ex...,We didn’t enter #Afghanistan for a 20 year ex...,1
18,SaintlySicilian,Mon Sep 06 18:20:32 +0000 2021,0,I know that in certain countries (North Korea...,I know that in certain countries (North Korea...,1
20,JohnKevinLucke1,Mon Sep 06 18:18:49 +0000 2021,0,Daily US COVID-19 infections up more than 300...,Daily US COVID-19 infections up more than 300...,1


In [50]:
y_train

array([0.375     , 0.54305869, 0.27878755, 0.8832313 , 0.51688715,
       0.84646178, 0.58648649, 0.75393727, 0.31773619, 0.5       ,
       0.97727518, 0.99700406, 0.5       , 0.75      , 0.94614259,
       0.5       , 0.69749636, 0.25452716, 0.5       , 0.73666328,
       0.11503524, 0.06184899, 0.828164  , 0.46076328, 0.750341  ,
       0.58834717, 0.39184758, 0.98545998, 0.95244897, 0.83241243,
       0.53877836, 0.96777607, 0.6571729 , 0.5       , 0.39473684,
       0.50433723, 0.5       , 0.93912194, 0.89350569, 0.75222828,
       0.7907548 ])

In [69]:
from sqlalchemy import create_engine  
from sqlalchemy import Table, Column, String, MetaData, Integer
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
from sqlalchemy.orm import Session


db_string = 'postgresql://postgres:drwho@localhost/twitter'
engine = create_engine(db_string)
Base.metadata.create_all(engine)
session = Session(bind=engine)


In [70]:
class TweetData (Base):
    __tablename__ = "tweet_data"
    id = Column(Integer, primary_key = True)
    name = Column(String)
    date = Column(String)
    retweet_count = Column(Integer)
    tweet_text = Column(String)
    tweet_cleaned = Column(String)
    favorite_count = Column(Integer)
    est_positivity = Column(Integer)

In [None]:
makeModel = False
tweets = session.query(TweetData)

In [85]:

for tweet in tweets:
    print(tweet.tweet_text)
    cleanTweet = preprocess_tweet_text(tweet.tweet_text)
    
    v = cleanTweet.split(" ")
    print(v)
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(v)
    
    X = vector.transform(v)
    
    print(X)


    
    if makeModel:
        blob_object = TextBlob(cleanTweet, analyzer=NaiveBayesAnalyzer())
        analysis = blob_object.sentiment

        c = analysis.classification
        p = analysis.p_pos
        n = analysis.p_neg

        if p < 0.4:
            r = -1
        elif p > 0.6:
            r = 1
        else:
            r = 0
    else:
        r = NB_model.predict(X)
        
        
    tweet.est_positivity = r 
    session.update(TweetData).values(est_positivity=r)
    session.commit()


 Just got #engaged to the brightest #star in this dark sky. She’s my #lighthouse in the #storm. 

['got', 'engaged', 'bright', 'star', 'dark', 'sky', '’', 'lighthouse', 'storm']
  (0, 3)	1.0
  (1, 2)	1.0
  (2, 0)	1.0
  (3, 6)	1.0
  (4, 1)	1.0
  (5, 5)	1.0
  (7, 4)	1.0
  (8, 7)	1.0


ValueError: dimension mismatch