In [1]:
# import required libraries
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer

import pandas as pd
import string

In [2]:
# download punctuation and stopwords from nltk
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apfle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apfle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\apfle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\apfle\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# load tweets_df and view
tweets_df = pd.read_csv("Resources/Tweets.csv")
tweets_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [4]:
# get dataframe ready for processing

# make sure the tweets in column "text" are strings
tweets_df['text'] = tweets_df['text'].astype('str')

# delete the unneccessary columns
tweets_df = tweets_df.drop(columns=["textID", "selected_text"])

In [5]:
def process_tweets(tweet):
    # make the text all lowercase
    tweet = tweet.lower()
    
    # remove punctuation
    tweet = "".join(char for char in tweet if char not in string.punctuation)
    
    # tokenize the tweet for url clean
    tokenize_tweet_url = word_tokenize(tweet)
    
    # remove urls
    tokenize_tweet_url = " ".join([i for i in tokenize_tweet_url if 'http' not in i])
    
    # tokenize the tweet
    tokenize_tweets = word_tokenize(tokenize_tweet_url)
    
    # remove stopwords
    stopword = stopwords.words("english")
    tweet_wo_stop = [word for word in tokenize_tweets if word not in stopword]
    
    # lemmatization
    lemm = WordNetLemmatizer()
    lemmed = [lemm.lemmatize(word) for word in tweet_wo_stop]
    
    # put string together
    final_tweet = " ".join(lemmed)
    
    return final_tweet

In [6]:
# process tweets using above function
tweets_df['text'] = tweets_df['text'].apply(lambda x: process_tweets(x))
tweets_df = tweets_df.dropna()

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,neutral
1,sooo sad miss san diego,negative
2,bos bullying,negative
3,interview leave alone,negative
4,son couldnt put release already bought,negative


In [7]:
# transform the sentiment column into numbers
dict_sentiment = {'positive': 1, 'neutral': 0, 'negative': -1}
tweets_df['sentiment'] = tweets_df['sentiment'].apply(lambda x: dict_sentiment.get(x))

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,0
1,sooo sad miss san diego,-1
2,bos bullying,-1
3,interview leave alone,-1
4,son couldnt put release already bought,-1


In [8]:
# delete all rows with column 'sentiment' = 0
indexSentiment = tweets_df[ (tweets_df['sentiment'] == 0)  ].index
tweets_df.drop(indexSentiment , inplace=True)
tweets_df.head()

Unnamed: 0,text,sentiment
1,sooo sad miss san diego,-1
2,bos bullying,-1
3,interview leave alone,-1
4,son couldnt put release already bought,-1
6,2am feeding baby fun smile coo,1


In [9]:
# assign X and y to the input and target columns
X = tweets_df['text']
y = tweets_df['sentiment']

In [10]:
# split the data into testing data and training data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
# transform the data into tfidf vectors
# fit the tfidf vectorizer on the training data to avoid bias

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Logistic Regression

In [12]:
# create a logistic regression model and fit it to the training data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [13]:
# look at the scores for the testing and training data
print(f"Training Data Score: {model.score(X_train_tfidf, y_train)}")
print(f"Testing Data Score: {model.score(X_test_tfidf, y_test)}")

Training Data Score: 0.931877444589309
Testing Data Score: 0.8618919579564899


In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# find metrics for testing data
print(confusion_matrix(y_test.values, model.predict(X_test_tfidf)))
print(classification_report(y_test.values, model.predict(X_test_tfidf)))

[[1638  263]
 [ 302 1888]]
              precision    recall  f1-score   support

          -1       0.84      0.86      0.85      1901
           1       0.88      0.86      0.87      2190

    accuracy                           0.86      4091
   macro avg       0.86      0.86      0.86      4091
weighted avg       0.86      0.86      0.86      4091



## Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

twitter_classi = RandomForestClassifier(n_estimators=300, random_state=0)
twitter_classi.fit(X_train_tfidf, y_train)

In [16]:
print('RandomForestClassifier Score: ', twitter_classi.score(X_train_tfidf, y_train))

RandomForestClassifier Score:  0.9997555410691004


In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test.values, twitter_classi.predict(X_test_tfidf)))
print(classification_report(y_test.values, twitter_classi.predict(X_test_tfidf)))
print(accuracy_score(y_test.values, twitter_classi.predict(X_test_tfidf)))

[[1645  256]
 [ 349 1841]]
              precision    recall  f1-score   support

          -1       0.82      0.87      0.84      1901
           1       0.88      0.84      0.86      2190

    accuracy                           0.85      4091
   macro avg       0.85      0.85      0.85      4091
weighted avg       0.85      0.85      0.85      4091

0.8521143974578342


## Extra Trees Classifier

In [18]:
# code

## Ada Boost Classifier

In [19]:
# code

## Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
y_pred = mnb.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

from sklearn import metrics
accuracy_score = metrics.accuracy_score(y_pred, y_test)

print(accuracy_score)

0.8508922023955023


nigel and kelly add their code here

In [21]:
# import os
# # Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# # For example:
# spark_version = 'spark-3.2.3'
# # spark_version = 'spark-3.<enter version>'
# os.environ['SPARK_VERSION']=spark_version

# # Install Spark and Java
# !apt-get update
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
# !wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
# !tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
# !pip install -q findspark

# # Set Environment Variables
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# # Start a SparkSession
# import findspark
# findspark.init()