<h1> <b> Sentiment Analysis Project </b> </h1>
Team members: Mohamed Tarek, Hady Ahmed, Yousef Ahmed, Mohamed Gaber, Mohamed Allam, and Momen Mohamed <br>

T.A : Andrew Magdy


<h3> Importing Libraries </h3>

In [None]:
!pip install emot
!pip install fastapi
!pip install streamlit

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
import nltk
import string
import emot
import gensim
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import streamlit as st
import joblib
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Fetch the NLTK resources used further down: the stop-word list, WordNet
# (lemmatizer), the POS tagger (lemmatizer POS hints) and the VADER lexicon
# (label polarity scoring).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
# Raise pandas' display limits so long listings are not truncated.
pd.options.display.max_rows = 4000
pd.options.display.max_seq_items = 2000


<h3> Data Preprocessing </h3>

---



---


Mohamed Allam: Removing punctuation and stop words, and Lowercasing text.
<br>
Mohamed Tarek: Text normalization and Stemming or lemmatization

In [None]:
# importing data
# NOTE(review): absolute path (looks like a Colab upload location); the CSV must
# exist there before this cell runs.
data = pd.read_csv("/content/sentimentdataset.csv")

In [None]:
print(data.head())

In [None]:
print(data.info())

In [None]:
# Parse the timestamps so the `.dt` accessor (used for the per-year plot) works.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

In [None]:
print(data.info())

In [None]:
data.describe(include=['O'])

In [None]:
# Keep only the first row for each distinct 'Text' value (modifies `data` in place).
data.drop_duplicates(subset=['Text'],inplace=True)

In [None]:
# Renumber rows 0..n-1 after the de-duplication; later cells index by position/label i.
data.reset_index(drop=True, inplace=True)

In [None]:
data.describe(include=['O'])

In [None]:
data.describe()

In [None]:
print(data["Country"].unique())

In [None]:
# Remove leading/trailing whitespace so padded variants of a country collapse into one value.
data['Country'] = data['Country'].str.strip()

In [None]:
print(data["Country"].unique())

In [None]:
print(data["Sentiment (Label)"].unique())

In [None]:
# Remove leading/trailing whitespace from the raw sentiment labels.
data["Sentiment (Label)"] = data["Sentiment (Label)"].str.strip()

In [None]:
print(data["Sentiment (Label)"].unique())

In [None]:
# Lower-case the labels; the positive/neutral/negative lookup lists are matched against this form.
data["Sentiment (Label)"] = data["Sentiment (Label)"].str.lower()

In [None]:
print(data["Sentiment (Label)"].unique())

In [None]:
# label = data["Sentiment (Label)"]
# print(label.nunique())
# label = label.apply(change_to_orgin_stemmer)
# print(np.sort(label.unique()))
# print(label.nunique())

In [None]:
print(np.sort(data["Sentiment (Label)"].unique()))

In [None]:
# Curated label -> class overrides used by word_polarity / word_polarity2.
# The labels are stripped and lower-cased before lookup, so every entry here
# must be lower-case as well or it can never match.
positive = ['acceptance', 'accomplishment', 'admiration', 'adoration', 'adrenaline',
            'adventure', 'affection', 'amazement', 'amusement', 'anticipation',
            'arousal', 'artisticburst', 'awe', 'bittersweet', 'blessed',
            'breakthrough', 'calmness', 'captivation', 'joy', 'excitement',
            'contentment', 'serenity', 'happy', 'nostalgia', 'hopeful',
            'euphoria', 'elation', 'enthusiasm', 'pride',
            'determination', 'playful', 'surprise', 'inspiration', 'positive']

neutral = ['ambivalence', 'apprehensive', 'boredom', 'curiosity',
           'confusion', 'indifference', 'neutral']

negative = ['anger', 'anxiety', 'bad', 'betrayal', 'bitter', 'bitterness', 'despair',
            'grief', 'sad', 'loneliness', 'embarrassed', 'regret', 'frustration',
            'melancholy', 'numbness', 'hate', 'negative']

In [None]:
print(data["Sentiment (Label)"].value_counts());

In [None]:
def word_polarity(text):
    """Map a raw sentiment label to "Positive", "Negative" or "Neutral" (TextBlob variant).

    The curated ``neutral`` / ``positive`` / ``negative`` lists take priority.
    Only a label found in none of them falls back to TextBlob's polarity score:
    >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral.

    Args:
        text: the label string (expected stripped and lower-cased).

    Returns:
        One of the strings "Positive", "Negative", "Neutral".
    """
    # Explicit overrides first, so a hand-classified negative label can no
    # longer be reported as positive because of its lexicon score.
    if text in neutral:
        return "Neutral"
    if text in positive:
        return "Positive"
    if text in negative:
        return "Negative"

    score = TextBlob(text).sentiment.polarity
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    return "Neutral"

In [None]:
def word_polarity2(text):
    """Map a raw sentiment label to "Positive", "Negative" or "Neutral" (VADER variant).

    The curated ``neutral`` / ``positive`` / ``negative`` lists take priority.
    Only a label found in none of them falls back to VADER's compound score:
    >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral.

    Args:
        text: the label string (expected stripped and lower-cased).

    Returns:
        One of the strings "Positive", "Negative", "Neutral".
    """
    # Explicit overrides first, so a hand-classified negative label can no
    # longer be reported as positive because of its lexicon score.
    if text in neutral:
        return "Neutral"
    if text in positive:
        return "Positive"
    if text in negative:
        return "Negative"

    # Build the analyzer once and reuse it; constructing it per call reloads the lexicon.
    sid = getattr(word_polarity2, "_sid", None)
    if sid is None:
        sid = word_polarity2._sid = SentimentIntensityAnalyzer()

    score = sid.polarity_scores(text)['compound']
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    return "Neutral"

In [None]:
# Class distribution according to the TextBlob-based mapping (shown for comparison;
# `output` is overwritten by the VADER-based mapping in the next cell).
output = data["Sentiment (Label)"].apply(word_polarity)
print(output.value_counts())

In [None]:
# VADER-based mapping; this one is kept and written back over the raw labels.
# NOTE(review): the column is overwritten, so re-running this cell maps the
# already-mapped "Positive"/"Negative"/"Neutral" strings again.
output = data["Sentiment (Label)"].apply(word_polarity2)
data["Sentiment (Label)"] = output
print(output.value_counts())

In [None]:
output.unique()

In [None]:
# Encode the three class names as integers and keep the result as a
# one-column DataFrame; this is the target used by every model below.
# `lbl` is kept so the integer codes can be mapped back to class names.
lbl = LabelEncoder()
output = pd.DataFrame(lbl.fit_transform(list(output)))
print(output)

In [None]:
reviews = data["Text"]

In [None]:
l = pd.concat([reviews, output], axis=1)
print(l)

<h4> Remove Additional Spaces </h4>

In [None]:
reviews = reviews.apply(str.strip)

<h4> Lowering the Case of Letters </h4>

In [None]:
reviews = reviews.apply(str.lower)

<h4> Removing Hashtags </h4>

In [None]:
def remove_hashtags(text):
    """Drop every whitespace-separated token that contains a ``#``.

    The remaining tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    kept_tokens = filter(lambda token: "#" not in token, text.split())
    return ' '.join(kept_tokens)

In [None]:
reviews = reviews.apply(remove_hashtags)

<h4> Remove Punctuation </h4>

In [None]:
def remove_punctuation(text):
    """Delete every ASCII punctuation character from ``text`` except the apostrophe.

    The apostrophe is left in place here; a later step (``remove_dash``)
    removes it.
    """
    # str.maketrans(x, y, z): characters listed in z are deleted by translate().
    chars_to_delete = string.punctuation.replace("'", "")
    deletion_table = str.maketrans("", "", chars_to_delete)
    return text.translate(deletion_table)

In [None]:
reviews = reviews.apply(remove_punctuation)

<h4> Remove Stop Words </h4>

In [None]:
def remove_stop_words(text):
    """Remove NLTK's English stop words from ``text``.

    Tokens are split on whitespace and compared as-is (case-sensitive), then
    the survivors are re-joined with single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in text.split():
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
stop_words = set(stopwords.words('english'))
print("not" in stop_words)

In [None]:
reviews = reviews.apply(remove_stop_words)

<h4> Remove Apostrophes </h4>

In [None]:
def remove_dash(text):
    """Delete every apostrophe (``'``) from ``text``.

    Despite the name, dashes/hyphens are left untouched; only the
    apostrophe character is removed.
    """
    return text.replace("'", "")

In [None]:
reviews = reviews.apply(remove_dash)

In [None]:
remove_dash("i'am")

<h4> Replacing Emojis with Text </h4>

In [None]:
def replace_emojies(text):
    """Replace stand-alone emoji tokens with their textual names.

    The emoji -> name table is built from ``emot.EMOJI_UNICODE`` with the
    colons stripped from the names. Only whitespace-separated tokens that are
    exactly one emoji are replaced; an emoji glued to other characters is left
    unchanged.
    """
    emoji_names = {}
    for name, emoji in emot.EMOJI_UNICODE.items():
        emoji_names[emoji] = name.replace(":", "")
    # Explicit entry for the red heart followed by a variation selector.
    emoji_names['❤️'] = "heavy_black_heart"
    return ' '.join(emoji_names.get(token, token) for token in text.split())

In [None]:
reviews = reviews.apply(replace_emojies)

<h4> Reduce Words to Their Stems Using a Stemmer </h4>

In [None]:
def change_to_orgin_stemmer(text):
    """Porter-stem every whitespace-separated token of ``text``.

    Returns the stems re-joined with single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, text.split()))

In [None]:
reviews_stemmer = reviews.apply(change_to_orgin_stemmer)

<h4> Reduce Words to Their Lemmas Using a Lemmatizer </h4>

In [None]:
def change_to_orgin_lemmatizer(text):
    """Lemmatize every whitespace-separated token of ``text`` with WordNet.

    Each token is POS-tagged first; the first letter of the tag selects the
    WordNet part of speech (N, V, J, R), falling back to noun for any other tag.
    Returns the lemmas re-joined with single spaces.
    """
    pos_by_tag_initial = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}
    wnl = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = pos_by_tag_initial.get(tag[0], wordnet.NOUN)
        lemmas.append(wnl.lemmatize(token, wn_pos))
    return ' '.join(lemmas)

In [None]:
reviews_lemmatizer = reviews.apply(change_to_orgin_lemmatizer)

<h4> Observing the changes </h4>

In [None]:
print(reviews)

In [None]:
print(reviews_stemmer)

In [None]:
print(reviews_lemmatizer)

In [None]:
print(replace_emojies("game is on 🥇"))

In [None]:
print(lbl.transform([lbl.classes_[0]])[0])

<h3>Data Visualizations </h3>

In [None]:
data['Text'] = reviews_lemmatizer

In [None]:
# Sentiment counts for the 20 most frequent countries, one bar colour per class.
plt.figure(figsize=(10,8))

sns.countplot(data=data, y='Country', hue='Sentiment (Label)', palette="viridis_r", order=data['Country'].value_counts().iloc[:20].index)
plt.title("Count of The Positive, Negative, and Neutral in Each Country")

In [None]:
import matplotlib.dates as mdates

fig, ax = plt.subplots(1, 1)
fig.set_size_inches(10, 8)

# One colour per sentiment class; the dict order is also the legend order.
sentiment_colors = {"Positive": "green", "Negative": "red", "Neutral": "blue"}

# Number of rows per calendar year for each class. This expects the string
# labels in 'Sentiment (Label)', i.e. the column before it is label-encoded.
yearly_counts = {}
for sentiment in sentiment_colors:
    subset = data[data['Sentiment (Label)'] == sentiment]
    yearly_counts[sentiment] = subset.groupby(subset['Timestamp'].dt.year).count()

# Put a tick on every year in which any class has data, instead of taking the
# years of the positive class and patching in a hard-coded extra year.
years = sorted(set().union(*(counts.index for counts in yearly_counts.values())))
if years:
    ax.set_xlim(years[0], years[-1] + 1)
    ax.set_xticks(years)

for sentiment, color in sentiment_colors.items():
    counts = yearly_counts[sentiment]
    # 'ID' is non-null on every row, so its count is the number of rows in that year.
    sns.lineplot(ax=ax, data=counts, x=counts.index, y='ID', color=color)
    sns.scatterplot(ax=ax, data=counts, x=counts.index, y='ID', color=color)

# ax.lines holds one line per class, in plotting order.
ax.legend(handles=ax.lines, labels=list(sentiment_colors))

ax.tick_params(axis='x', labelrotation=45)

ax.set_title("Sentiment Across The Years")
ax.set_ylabel("Count")
ax.set_xlabel("Years")

In [None]:
from collections import Counter

# Count how often each token occurs across all cleaned texts.
words = Counter()
for text in data['Text']:
    words.update(text.split())

# NOTE(review): this draws one bar per distinct word in the corpus; consider
# plotting only words.most_common(N) if the chart is unreadable.
sns.barplot(x=list(words.keys()), y=list(words.values()), order=sorted(words))


<h3> Feature Engineering </h3>
Momen Mohamed: Word Embeddings
<br>
Mohamed Gaber: Bag-of-Words
<br>
Hady Ahmed: TF-IDF

<h4> Checking How Many Words in the Dataset </h4>

In [None]:
# Split the stemmed reviews on whitespace, then report the number of reviews
# and the total number of tokens across all of them.
tokenized_reviews_stemmer = reviews_stemmer.apply(str.split)
print(tokenized_reviews_stemmer.shape)
x = sum(len(tokens) for tokens in tokenized_reviews_stemmer)
print(x)

In [None]:
# Split the lemmatized reviews on whitespace, then report the number of reviews
# and the total number of tokens across all of them.
tokenized_reviews_lemmatizer = reviews_lemmatizer.apply(str.split)
print(tokenized_reviews_lemmatizer.shape)
x = sum(len(tokens) for tokens in tokenized_reviews_lemmatizer)
print(x)

<h4> Word Embedding using word2vec </h4>

In [None]:
# Dimensionality of the word2vec embeddings; shared by both models and the averaging helpers.
vector_size = 100

In [None]:
tokenized_reviews_lemmatizer = reviews_lemmatizer.apply(lambda x: x.split()) # tokenizing

# Passing the corpus to the constructor builds the vocabulary AND trains the
# model, so the number of epochs is set here. Calling .train() again afterwards
# would train a second time on top of the constructor's own pass.
# NOTE(review): with workers > 1 the seed alone does not make runs reproducible.
model_w2v_lemmatizer = gensim.models.Word2Vec(
            tokenized_reviews_lemmatizer,
            vector_size=vector_size, # number of embedding dimensions
            window=12, # context window size: how many neighbouring words are considered
            min_count=2, # ignore words that occur fewer than this many times
            sg = 1, # 1 = skip-gram training algorithm
            hs = 0, # no hierarchical softmax; negative sampling is used instead
            negative = 10, # number of negative samples
            workers= 2, # number of worker threads
            epochs=20, # passes over the corpus
            seed = 34)

In [None]:
tokenized_reviews_stemmer = reviews_stemmer.apply(lambda x: x.split()) # tokenizing

# Passing the corpus to the constructor builds the vocabulary AND trains the
# model, so the number of epochs is set here. Calling .train() again afterwards
# would train a second time on top of the constructor's own pass.
# NOTE(review): with workers > 1 the seed alone does not make runs reproducible.
model_w2v_stemmer = gensim.models.Word2Vec(
            tokenized_reviews_stemmer,
            vector_size=vector_size, # number of embedding dimensions
            window=12, # context window size: how many neighbouring words are considered
            min_count=2, # ignore words that occur fewer than this many times
            sg = 1, # 1 = skip-gram training algorithm
            hs = 0, # no hierarchical softmax; negative sampling is used instead
            negative = 10, # number of negative samples
            workers= 2, # number of worker threads
            epochs=20, # passes over the corpus
            seed = 34)

In [None]:
def word_vector_stemmer(tokens, size):
    """Average the ``model_w2v_stemmer`` embeddings of ``tokens``.

    Tokens that are not in the model's vocabulary are ignored.

    Returns:
        A float array of shape (1, size); all zeros when none of the tokens
        is in the vocabulary (including an empty token list).
    """
    total = np.zeros((1, size))
    hits = 0
    for token in tokens:
        try:
            embedding = model_w2v_stemmer.wv.get_vector(token)
        except KeyError:  # token is not in the vocabulary
            continue
        total += embedding.reshape((1, size))
        hits += 1
    return total / hits if hits else total

In [None]:
def word_vector_lemmatizer(tokens, size):
    """Average the ``model_w2v_lemmatizer`` embeddings of ``tokens``.

    Tokens that are not in the model's vocabulary are ignored.

    Returns:
        A float array of shape (1, size); all zeros when none of the tokens
        is in the vocabulary (including an empty token list).
    """
    total = np.zeros((1, size))
    hits = 0
    for token in tokens:
        try:
            embedding = model_w2v_lemmatizer.wv.get_vector(token)
        except KeyError:  # token is not in the vocabulary
            continue
        total += embedding.reshape((1, size))
        hits += 1
    return total / hits if hits else total

In [None]:
# One averaged embedding per review, stacked into an (n_reviews, vector_size) matrix.
wordvec_arrays = np.zeros((len(tokenized_reviews_stemmer), vector_size))

# Iterate by position so the result does not depend on the Series having a 0..n-1 index.
for row, tokens in enumerate(tokenized_reviews_stemmer):
    wordvec_arrays[row, :] = word_vector_stemmer(tokens, vector_size)

reviews_stemmer_word_embedding = pd.DataFrame(wordvec_arrays)

In [None]:
# One averaged embedding per review, stacked into an (n_reviews, vector_size) matrix.
wordvec_arrays = np.zeros((len(tokenized_reviews_lemmatizer), vector_size))

# Iterate by position so the result does not depend on the Series having a 0..n-1 index.
for row, tokens in enumerate(tokenized_reviews_lemmatizer):
    wordvec_arrays[row, :] = word_vector_lemmatizer(tokens, vector_size)

reviews_lemmatize_word_embedding = pd.DataFrame(wordvec_arrays)
print(reviews_lemmatize_word_embedding.shape)

<h4>Bag of Words</h4>

In [None]:
# Shared settings for the four CountVectorizer / TfidfVectorizer instances below.
max_features = 2000  # upper bound on the vocabulary size
max_df = 0.7         # float: fraction of documents (drop terms above it)
min_df = 2           # int: absolute number of documents (drop terms below it)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the stemmed reviews.
# NOTE(review): the vectorizer is fitted on the full corpus, before any
# train/test split, so the vocabulary is learned from the test rows as well.
bow_vectorizer_stemmer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features, stop_words='english')
reviews_stemmer_bag_of_words = bow_vectorizer_stemmer.fit_transform(reviews_stemmer)

# Convert the sparse matrix into a dense DataFrame for the models below.
reviews_stemmer_bag_of_words = pd.DataFrame.sparse.from_spmatrix(reviews_stemmer_bag_of_words)
reviews_stemmer_bag_of_words = reviews_stemmer_bag_of_words.sparse.to_dense()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the lemmatized reviews. This fitted vectorizer is
# also the one saved to disk and reused by preprocessing1.
# NOTE(review): the vectorizer is fitted on the full corpus, before any
# train/test split, so the vocabulary is learned from the test rows as well.
bow_vectorizer_lemmatizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features, stop_words='english')
reviews_lemmatizer_bag_of_words = bow_vectorizer_lemmatizer.fit_transform(reviews_lemmatizer)
print(reviews_lemmatizer_bag_of_words.shape)
print(type(reviews_lemmatizer_bag_of_words))
# Convert the sparse matrix into a dense DataFrame for the models below.
reviews_lemmatizer_bag_of_words = pd.DataFrame.sparse.from_spmatrix(reviews_lemmatizer_bag_of_words)
reviews_lemmatizer_bag_of_words = reviews_lemmatizer_bag_of_words.sparse.to_dense()

<h4>TF-IDF</h4>

In [None]:
#TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the stemmed reviews.
# NOTE(review): the vectorizer is fitted on the full corpus, before any
# train/test split, so vocabulary and IDF weights also reflect the test rows.
tfidf_vectorizer_stemmer = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features, stop_words='english')
reviews_stemmer_tfidf = tfidf_vectorizer_stemmer.fit_transform(reviews_stemmer)

# Convert the sparse matrix into a dense DataFrame for the models below.
reviews_stemmer_tfidf = pd.DataFrame.sparse.from_spmatrix(reviews_stemmer_tfidf)
reviews_stemmer_tfidf = reviews_stemmer_tfidf.sparse.to_dense()

In [None]:
#TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the lemmatized reviews. This fitted vectorizer is also
# the one saved to disk and reused by preprocessing2.
# NOTE(review): the vectorizer is fitted on the full corpus, before any
# train/test split, so vocabulary and IDF weights also reflect the test rows.
tfidf_vectorizer_lemmatizer = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features, stop_words='english')
reviews_lemmatizer_tfidf = tfidf_vectorizer_lemmatizer.fit_transform(reviews_lemmatizer)
print(type(reviews_lemmatizer_tfidf))
print(reviews_lemmatizer_tfidf.shape)
# Convert the sparse matrix into a dense DataFrame for the models below.
reviews_lemmatizer_tfidf = pd.DataFrame.sparse.from_spmatrix(reviews_lemmatizer_tfidf)
reviews_lemmatizer_tfidf = reviews_lemmatizer_tfidf.sparse.to_dense()
print(reviews_lemmatizer_tfidf.shape)

<h3> Model Selection and Training </h3>
Mohamed Allam & Momen Mohamed: Naive Bayes
<br>
Mohamed Tarek & Mohamed Gaber: SVM
<br>
Yousef Ahmed & Hady Ahmed: Random Forest Classification
<br>
Everyone will try their algorithm on each of the datasets produced by the feature-engineering step.

In [None]:
data.info()

In [None]:
# Replace the string class names in `data` with the integer codes from the LabelEncoder.
# NOTE(review): the visualization cells compare this column against the strings
# 'Positive'/'Negative'/'Neutral', so they must be run before this cell.
data["Sentiment (Label)"] = output

In [None]:
data.select_dtypes(int).corr()

In [None]:
# The six feature matrices to compare, followed by the encoded target as the
# LAST element; the model cells read the target as reviews_datasets[-1].
reviews_datasets = [reviews_lemmatizer_tfidf,reviews_stemmer_tfidf, reviews_lemmatizer_bag_of_words,
                    reviews_stemmer_bag_of_words, reviews_lemmatize_word_embedding,
                    reviews_stemmer_word_embedding, output]

In [None]:
# Display names, index-aligned with reviews_datasets (same order, same length).
reviews_datasets_names = ["reviews_lemmatizer_tfidf","reviews_stemmer_tfidf", "reviews_lemmatizer_bag_of_words",
                    "reviews_stemmer_bag_of_words", "reviews_lemmatize_word_embedding",
                    "reviews_stemmer_word_embedding", "output"]

<h4> Naive Bayes Model </h4>

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# Multinomial Naive Bayes on the TF-IDF and bag-of-words feature sets only
# (entries 0-3 of reviews_datasets). The word2vec embeddings that follow can
# contain negative values, which MultinomialNB does not accept, and the last
# entry of the list is the target itself.
predictions = []
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
for counter, features in enumerate(reviews_datasets[:4]):
    model = MultinomialNB(alpha=0.2, fit_prior=True, force_alpha=True)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=81)

    # Model training
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model.predict(X_train)
    predicted = model.predict(X_test)

    predictions.append(predicted)
    print("Dataset: ",reviews_datasets_names[counter])
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# Gaussian Naive Bayes on every feature set. The last entry of reviews_datasets
# is the target, so it is excluded from the features: training on it would
# "predict" the label from itself.
predictions = []  # start fresh instead of appending to a list left by another cell
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
for counter, features in enumerate(reviews_datasets[:-1]):
    model = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=61)

    # Model training
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model.predict(X_train)
    predicted = model.predict(X_test)

    predictions.append(predicted)
    print("Dataset: ",reviews_datasets_names[counter])
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))



















































<h4> SVM Model </h4>

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# SVM with a sigmoid kernel on every feature set. The last entry of
# reviews_datasets is the target, so it is excluded from the features.
predictions = []
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
for counter, features in enumerate(reviews_datasets[:-1]):
    model = SVC(kernel='sigmoid')
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=61)

    # Model training
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model.predict(X_train)
    predicted = model.predict(X_test)

    predictions.append(predicted)
    print("Dataset: ",reviews_datasets_names[counter])
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# SVM with an RBF kernel on every feature set. The last entry of
# reviews_datasets is the target, so it is excluded from the features.
predictions = []
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
for counter, features in enumerate(reviews_datasets[:-1]):
    model = SVC(kernel='rbf')
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=61)

    # Model training
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model.predict(X_train)
    predicted = model.predict(X_test)

    predictions.append(predicted)
    print("Dataset: ",reviews_datasets_names[counter])
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))

<h4> Random Forest Model </h4>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# Random forest (default criterion) on every feature set. The last entry of
# reviews_datasets is the target, so it is excluded from the features.
predictions = []
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
for counter, features in enumerate(reviews_datasets[:-1]):
    # Seed the forest so the reported scores are reproducible between runs.
    model = RandomForestClassifier(random_state=61)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=61)

    # Model training
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model.predict(X_train)
    predicted = model.predict(X_test)

    predictions.append(predicted)
    print("Dataset: ",reviews_datasets_names[counter])
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# Random forest with the entropy criterion on every feature set. The last entry
# of reviews_datasets is the target, so it is excluded from the features.
predictions = []
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
for counter, features in enumerate(reviews_datasets[:-1]):
    # Seed the forest so the reported scores are reproducible between runs.
    model = RandomForestClassifier(criterion="entropy", random_state=61)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=61)

    # Model training
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model.predict(X_train)
    predicted = model.predict(X_test)

    predictions.append(predicted)
    print("Dataset: ",reviews_datasets_names[counter])
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))

<h4> Multinomial Logistic Regression </h4>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit

# Multinomial logistic regression on every feature set. The last entry of
# reviews_datasets is the target, so it is excluded from the features.
predictions = []
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
for counter, features in enumerate(reviews_datasets[:-1]):
    model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=61)

    # Model training
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model.predict(X_train)
    predicted = model.predict(X_test)

    predictions.append(predicted)
    print("Dataset: ",reviews_datasets_names[counter])
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))

<h2> Model Evaluation </h2>

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# Final Gaussian Naive Bayes model (model1), trained on the lemmatized
# bag-of-words features. The dataset index is stated explicitly; reusing the
# `counter` left over from an earlier loop pointed past the end of the names list.
predictions = []
dataset_index = 2  # reviews_lemmatizer_bag_of_words
i = reviews_datasets[dataset_index]
model1 = GaussianNB()
X_train, X_test, y_train, y_test = train_test_split(i, np.ravel(reviews_datasets[-1]), test_size=0.25, random_state=61)

# Model training
model1.fit(X_train, y_train)

# Predictions on both splits, to compare train and test performance
train_predicted = model1.predict(X_train)
predicted = model1.predict(X_test)

predictions.append(predicted)
print("Dataset: ",reviews_datasets_names[dataset_index])
print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
print("Test Accuracy: ", accuracy_score(y_test, predicted))
print("Train Report: ", metrics.classification_report(y_train, train_predicted))
print("Test Report: ", metrics.classification_report(y_test, predicted))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# Train the TF-IDF MultinomialNB model on 100 different train/test splits to
# see how much the scores depend on the split.
# NOTE(review): if the random_state used for the final model was picked from
# these results, its reported test accuracy is optimistic.
dataset_index = 0  # reviews_lemmatizer_tfidf
labels = np.ravel(reviews_datasets[-1])  # 1-D target instead of an (n, 1) DataFrame
predictions = []  # one entry per seed (previously reset on every iteration)
for seed in range(0, 100):
    model2 = MultinomialNB(alpha=0.2, fit_prior=True, force_alpha=True)
    X_train, X_test, y_train, y_test = train_test_split(reviews_datasets[dataset_index], labels, test_size=0.25, random_state=seed)

    # Model training
    model2.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    train_predicted = model2.predict(X_train)
    predicted = model2.predict(X_test)

    predictions.append(predicted)

    print("Dataset: ",reviews_datasets_names[dataset_index])
    print(seed)
    print("Train Accuracy: ", accuracy_score(y_train, train_predicted))
    print("Test Accuracy: ", accuracy_score(y_test, predicted))
    print("Train Report: ", metrics.classification_report(y_train, train_predicted))
    print("Test Report: ", metrics.classification_report(y_test, predicted))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

# Final Multinomial Naive Bayes model (model2), trained on the lemmatized
# TF-IDF features; this is the model wrapped in sk_pipe2 and saved below.
predictions = []
model2 = MultinomialNB(alpha=0.2, fit_prior=True, force_alpha=True)
X_train, X_test, y_train, y_test = train_test_split(reviews_datasets[0], np.ravel(reviews_datasets[-1]), test_size=0.25, random_state=81)

# Model training
model2.fit(X_train, y_train)

# Predictions on both splits, to compare train and test performance
train_predicted = model2.predict(X_train)
predicted = model2.predict(X_test)

predictions.append(predicted)

print(reviews_datasets_names[0])
print(accuracy_score(y_train, train_predicted))
print(accuracy_score(y_test, predicted))
print(metrics.classification_report(y_train, train_predicted))
print(metrics.classification_report(y_test, predicted))

In [None]:
from fastapi import FastAPI
# NOTE(review): the app object is created here but no routes are registered in the cells shown.
app = FastAPI()

In [None]:
# Persist the fitted TF-IDF vectorizer (lemmatized variant) to the working directory.
joblib.dump(tfidf_vectorizer_lemmatizer, "tfidf.pkl")

In [None]:
# Persist the fitted bag-of-words vectorizer (lemmatized variant) to the working directory.
joblib.dump(bow_vectorizer_lemmatizer, "bag_of_words.pkl")

In [None]:
def preprocessing2(text):
    """Clean one raw text and return its TF-IDF feature row (input for ``model2``).

    Applies the same cleaning steps, in the same order, that were applied to
    the training reviews, then vectorizes the result with the already fitted
    ``tfidf_vectorizer_lemmatizer``.

    Args:
        text: a single raw input string.

    Returns:
        A dense array of shape (1, n_features).
    """
    text = text.strip()
    text = text.lower()
    text = remove_hashtags(text)
    text = remove_punctuation(text)
    text = remove_stop_words(text)
    text = remove_dash(text)
    text = replace_emojies(text)
    text = change_to_orgin_lemmatizer(text)
    # transform() accepts a one-element list, so no dummy second document is
    # needed; each row is vectorized independently of the others.
    features = tfidf_vectorizer_lemmatizer.transform([text])
    return features.toarray()

In [None]:
def preprocessing1(text):
    """Clean one raw text and return its bag-of-words feature row (input for ``model1``).

    Applies the same cleaning steps, in the same order, that were applied to
    the training reviews, then vectorizes the result with the already fitted
    ``bow_vectorizer_lemmatizer``.

    Args:
        text: a single raw input string.

    Returns:
        A dense array of shape (1, n_features).
    """
    text = text.strip()
    text = text.lower()
    text = remove_hashtags(text)
    text = remove_punctuation(text)
    text = remove_stop_words(text)
    text = remove_dash(text)
    text = replace_emojies(text)
    text = change_to_orgin_lemmatizer(text)
    # transform() accepts a one-element list, so no dummy second document is
    # needed; each row is vectorized independently of the others.
    features = bow_vectorizer_lemmatizer.transform([text])
    return features.toarray()

In [None]:
# Pipeline 1: raw text -> bag-of-words row (preprocessing1) -> Gaussian NB (model1).
processing = FunctionTransformer(preprocessing1)
sk_pipe = Pipeline([("trans", processing), ("model", model1)])

In [None]:
# Pipeline 2: raw text -> TF-IDF row (preprocessing2) -> Multinomial NB (model2).
# `processing` is rebound here; sk_pipe keeps its own transformer object.
processing = FunctionTransformer(preprocessing2)
sk_pipe2 = Pipeline([("trans", processing), ("model", model2)])

In [None]:
# Smoke test: both pipelines take one raw string and print the predicted class code
# (an integer from the LabelEncoder, not the class name).
text = "I hate being sad"
print(sk_pipe.predict(text))
print(sk_pipe2.predict(text))

In [None]:
# Persist the TF-IDF + Multinomial NB pipeline; sk_pipe is not saved.
# NOTE(review): the pipeline wraps the notebook-defined preprocessing2, so the
# process that loads this file presumably needs that function (and its helpers)
# importable under the same name -- confirm before deploying.
joblib.dump(sk_pipe2, "sentiment_analysis_model2.pkl")