In [1]:
# import required libraries
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import string

In [2]:
# download punctuation and stopwords from nltk
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/rg1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/rg1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Transform the data

In [3]:
# load tweets_df and view
tweets_df = pd.read_csv("Resources/Tweets.csv")
tweets_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [4]:
# get dataframe ready for processing

# make sure the tweets in column "text" are strings
tweets_df['text'] = tweets_df['text'].astype('str')

# delete the unneccessary columns
tweets_df = tweets_df.drop(columns=["textID", "selected_text"])

In [5]:
# tweets_df_clean = tweets_df
# import re
# tweets_df_clean['text'] = tweets_df['text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

# tweets_df = tweets_df_clean

# tweets_df['sentiment'] = tweets_df['sentiment'].astype('str')
# tweets_df.dtypes

In [6]:
def process_tweets(tweet):
    # make the text all lowercase
    tweet = tweet.lower()
    
    # remove punctuation
    tweet = "".join(char for char in tweet if char not in string.punctuation)
    
    # tokenize the tweet for url clean
    tokenize_tweet_url = word_tokenize(tweet)
    
    # remove urls
    tokenize_tweet_url = " ".join([i for i in tokenize_tweet_url if 'http' not in i])
    
    # tokenize the tweet
    tokenize_tweets = word_tokenize(tokenize_tweet_url)
    
    # remove stopwords
    stopword = stopwords.words("english")
    tweet_wo_stop = [word for word in tokenize_tweets if word not in stopword]
    
    # put string together
    final_tweet = " ".join(tweet_wo_stop)
    
    return final_tweet

In [7]:
# process tweets using above function
tweets_df['text'] = tweets_df['text'].apply(lambda x: process_tweets(x))
tweets_df = tweets_df.dropna()

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,neutral
1,sooo sad miss san diego,negative
2,boss bullying,negative
3,interview leave alone,negative
4,sons couldnt put releases already bought,negative


In [8]:
# transform the sentiment column into numbers
dict_sentiment = {'positive': 1, 'neutral': 0, 'negative': -1}
tweets_df['sentiment'] = tweets_df['sentiment'].apply(lambda x: dict_sentiment.get(x))

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,0
1,sooo sad miss san diego,-1
2,boss bullying,-1
3,interview leave alone,-1
4,sons couldnt put releases already bought,-1


In [9]:
# Create the X and y data 
def create_train_test_data (tweets_df):
    
    # assign X and y to the input and target columns
    X = tweets_df['text']
    y = tweets_df['sentiment']

    # split the data into testing data and training data
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # transform the data into tfidf vectors
    # fit the tfidf vectorizer on the training data to avoid bias
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    # save te train and test data into a dictionary
    train_test_data_dict = {}
    train_test_data_dict["X_train"]       = X_train
    train_test_data_dict["X_test"]        = X_test
    train_test_data_dict["y_train"]       = y_train
    train_test_data_dict["y_test"]        = y_test
    train_test_data_dict["X_train_tfidf"] = X_train_tfidf
    train_test_data_dict["X_test_tfidf"]  = X_test_tfidf
    
    return train_test_data_dict

# Create the Logistic Regression Model

In [10]:
def create_model(tweets_df, attempt_num):
    # Get the train and test data
    train_test_data_dict = create_train_test_data (tweets_df)

    X_train       = train_test_data_dict.get("X_train")
    X_test        = train_test_data_dict.get("X_test")
    y_train       = train_test_data_dict.get("y_train")       
    y_test        = train_test_data_dict.get("y_test")
    X_train_tfidf = train_test_data_dict.get("X_train_tfidf")
    X_test_tfidf  = train_test_data_dict.get("X_test_tfidf")

    # create a logistic regression model and fit it to the training data
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)
    
    # look at the scores for the testing and training data
    print(f"Attempt {attempt_num} ----------------------------------------")
    print(f"Training Data Score: {model.score(X_train_tfidf, y_train)}")
    print(f"Testing Data Score: {model.score(X_test_tfidf, y_test)} \n")
        
    from sklearn.metrics import classification_report, confusion_matrix

    # find metrics for testing data
    print(confusion_matrix(y_test.values, model.predict(X_test_tfidf)))
    print(classification_report(y_test.values, model.predict(X_test_tfidf)))

# Compile, Train, and Evaluate the Model

### Attempt 1:  Using all data in the df including sentiment = positive, negative and neutral

In [11]:
# Call the model  
attempt_num = 1
create_model(tweets_df, attempt_num)

Attempt 1 ----------------------------------------
Training Data Score: 0.8268316351285784
Testing Data Score: 0.6933488575171008 

[[1174  712  122]
 [ 334 2100  333]
 [  83  523 1490]]
              precision    recall  f1-score   support

          -1       0.74      0.58      0.65      2008
           0       0.63      0.76      0.69      2767
           1       0.77      0.71      0.74      2096

    accuracy                           0.69      6871
   macro avg       0.71      0.68      0.69      6871
weighted avg       0.70      0.69      0.69      6871



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Attempt 2: Wtihout sentiment = neutral

In [12]:
# delete all rows with column 'sentiment' = 0
indexSentiment = tweets_df[ (tweets_df['sentiment'] == 0)  ].index
tweets_df.drop(indexSentiment , inplace=True)

# Call the model  
attempt_num = 2
create_model(tweets_df, attempt_num)

Attempt 2 ----------------------------------------
Training Data Score: 0.9323663624511083
Testing Data Score: 0.8672696162307504 

[[1728  244]
 [ 299 1820]]
              precision    recall  f1-score   support

          -1       0.85      0.88      0.86      1972
           1       0.88      0.86      0.87      2119

    accuracy                           0.87      4091
   macro avg       0.87      0.87      0.87      4091
weighted avg       0.87      0.87      0.87      4091

