This project uses the NLTK library to preprocess text data with stemming and stop-word removal, then encodes the result with
TfidfVectorizer before training a machine-learning classifier.

This model could be used to analyze Twitter users' reviews of a new violent video game. Context is important because
words that would typically be recognized as negative (e.g., kill, bomb, die) may be part of a positive review in this case.

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')  # fetch NLTK's stop-word lists: very common words (e.g. "the", "and") that are filtered out before stemming

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# DATA

In [3]:
# Data source: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis

# NOTE: absolute, machine-specific path -- edit this to wherever the CSV lives before running elsewhere.
DATA_PATH = 'C:/Users/micls/OneDrive/Desktop/ML Projects/twitter_gaming.csv'
column_names = ['game', 'target', 'tweet']

# Column labels are supplied explicitly through `names` rather than taken from the file.
twitter_data = pd.read_csv(DATA_PATH, names=column_names)
print(twitter_data.shape)
twitter_data.head()


(73824, 3)


Unnamed: 0,game,target,tweet
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
# Number of missing entries in each column
twitter_data.isna().sum()

game      0
target    0
tweet     0
dtype: int64

In [5]:
# Target distribution: rows per sentiment label as a percentage of all rows.
# The denominator is read from the frame itself instead of a hard-coded row count,
# so the percentages stay correct if the dataset is filtered or replaced.
(twitter_data['target'].value_counts() / len(twitter_data)) * 100

Negative      30.223234
Positive      27.929941
Neutral       24.451398
Irrelevant    17.395427
Name: target, dtype: float64

In [6]:
# TODO: decide whether to encode the target labels as integers (scikit-learn classifiers also accept string labels as-is)

Stemming - reducing words to their roots

In [7]:
port_stem = PorterStemmer()  # Single Porter stemmer instance reused for every tweet

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# per-word filter produced a fresh list for every word of every tweet and then
# scanned it linearly; a frozenset gives constant-time membership tests instead.
english_stopwords = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop tokens that appear in NLTK's English stop-word list.
      4. Reduce each remaining token to its Porter stem.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if no token survives).
    """
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in english_stopwords
    )

    

In [8]:
# Run the stemming pipeline on every tweet and keep the result alongside the raw text
twitter_data['stemmed_content'] = twitter_data['tweet'].map(stemming)

In [9]:
# Spot-check the new stemmed_content column against the original tweets
twitter_data.head()

Unnamed: 0,game,target,tweet,stemmed_content
0,Borderlands,Positive,im getting on borderlands and i will murder yo...,im get borderland murder
1,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,Borderlands,Positive,im getting on borderlands and i will kill you ...,im get borderland kill
3,Borderlands,Positive,im coming on borderlands and i will murder you...,im come borderland murder
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im get borderland murder


# Train Test split

In [10]:
# Features: the stemmed tweet text; labels: the sentiment class of each tweet
X = twitter_data['stemmed_content'].to_numpy()
Y = twitter_data['target'].to_numpy()

In [11]:
# Hold out 20% of the tweets for testing; stratifying on Y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    stratify=Y,
    random_state=2,
)

In [12]:
# Convert the stemmed text into numerical TF-IDF features.
# NOTE(review): this cell rebinds X_train / X_test from raw text to vectorized features,
# so re-running it without first re-running the train/test split cell will feed
# already-vectorized data back into the vectorizer -- expected to fail; confirm before relying on re-runs.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training tweets only, then encode them
X_train = vectorizer.fit_transform(X_train)

# Encode the test tweets with the vocabulary learned above (no re-fitting on test data)
X_test = vectorizer.transform(X_test)

# Training Model

In [13]:
# Logistic-regression classifier; the solver is allowed up to 1000 iterations
LogReg = LogisticRegression(max_iter=1000)
LogReg.fit(X=X_train, y=Y_train)

Model evaluation

In [14]:
# Accuracy on the data the model was fitted on (predicted labels vs. true training labels)
X_train_prediction = LogReg.predict(X=X_train)
train_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
train_accuracy

0.8358251917573951

In [15]:
# Accuracy on the held-out test partition (predicted labels vs. true test labels)
X_test_prediction = LogReg.predict(X=X_test)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
test_accuracy

0.7781916694886556

In [16]:
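#using the model on new data (template -- new text must go through the same preprocessing as the training data):
#new_tweets = #load new raw tweets, e.g. as a pandas Series of strings
#X_new = vectorizer.transform(new_tweets.apply(stemming))  #stem, then encode with the already-fitted vectorizer
#prediction = LogReg.predict(X_new)
#print(prediction)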