<a href="https://colab.research.google.com/github/unt-iialab/INFO5731_Spring2020/blob/master/Assignments/INFO5731_Assignment_Four.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **INFO5731 Assignment Four**

In this assignment, you are required to conduct topic modeling and sentiment analysis based on **the dataset you created from assignment three**.

# **Question 1: Topic Modeling**

(30 points). This question is designed to help you develop a feel for the way topic modeling works, the connection to the human meanings of documents. Based on the dataset from assignment three, write a python program to **identify the top 10 topics in the dataset**. Before answering this question, please review the materials in lesson 8, especially the code for LDA, LSA, and BERTopic. The following information should be reported:

(1) Features (text representation) used for topic modeling.

(2) Top 10 clusters for topic modeling.

(3) Summarize and describe the topic for each cluster. 


In [None]:
# Write your code here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
# Silence library warnings so they do not clutter the cell outputs.
warnings.simplefilter("ignore")
# Let pandas display up to 999 columns when a wide frame is rendered.
pd.set_option("display.max_columns", 999)

# Load the dataset file and preview its first rows.
df = pd.read_csv("datafile.csv")
df.head()


In [None]:
# Select the second column of the frame (by position) as the text to model.
# NOTE(review): presumably this is the cleaned text column — confirm against datafile.csv.
data = df.iloc[:, 1]
data.head()

In [8]:
# for Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# lemmatization
import spacy
# To Plot tools
import pyLDAvis
import pyLDAvis.gensim_models
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then load the English list
# that remove_stopwords() filters against.
nltk.download('stopwords')
stop_words = stopwords.words('english')

In [9]:
# Tokenization: turn every sentence into a list of word tokens.
def sentences_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Each item is coerced to ``str`` first, so non-string values do not break
    tokenization. ``deacc=True`` asks gensim to strip accent marks from the
    tokens (per gensim's documentation). One token list is yielded per input
    item, in order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)


# Materialize the token lists for the whole corpus.
data_words = list(sentences_to_words(data))

['perfect', 'everi', 'aspect']


In [None]:
# 1. Learn multi-word phrases (bigrams, then trigrams) from the tokenized corpus.
# A high threshold (100) makes the models merge fewer word pairs.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# Phraser wraps a trained Phrases model for applying it to token lists.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigrams are learned on top of the bigram-transformed corpus.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Sanity check: show the first document after bigram + trigram merging.
first_doc_phrases = trigram_mod[bigram_mod[data_words[0]]]
print(first_doc_phrases)

In [None]:
# Defining functions for stopwords, bigrams, trigrams  lemmatization
def remove_stopwords(texts):
    """Return every document's tokens with stop words removed.

    Args:
        texts: iterable of documents (token lists or raw strings). Each one is
            passed through ``str()`` and re-tokenized with ``simple_preprocess``.

    Returns:
        A list with one token list per input document, containing only the
        tokens that are not in the module-level ``stop_words`` collection.
    """
    # Build the set once per call: hashing is O(1) per token, whereas testing
    # membership in the original list scans it for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the fitted bigram phraser (``bigram_mod``) to each tokenized document."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to each tokenized document."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            analysed as one spaCy document.
        allowed_postags: coarse POS tags (``token.pos_`` values) to keep.
            The default list is only read, never mutated.

    Returns:
        A list with one list of lemmas per input document, containing the
        lemma of every token whose ``pos_`` is in ``allowed_postags``.

    Note:
        Uses the module-level spaCy pipeline ``nlp``, which must be loaded
        before this function is called.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches, which
    # avoids the per-call overhead of invoking nlp() once per document while
    # producing the same Doc objects in the same order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]
# Load the small English spaCy pipeline up front. The parser and NER
# components are disabled because only tagging/lemmatization is needed.
# (install the model with: python3 -m spacy download en_core_web_sm)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stage 1: drop stop words from the tokenized documents.
data_words_nostops = remove_stopwords(data_words)

# Stage 2: merge frequent word pairs into single bigram tokens.
data_words_bigrams = make_bigrams(data_words_nostops)

# Stage 3: lemmatize, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Preview the first processed document.
print(data_lemmatized[:1])

In [None]:
# The lemmatized documents are the texts the topic model is built on.
texts_1 = data_lemmatized

# Dictionary: maps every unique token to an integer id.
id2word_words = corpora.Dictionary(texts_1)

# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word_words.doc2bow, texts_1))

# Preview the first document's bag-of-words vector.
print(corpus[:1])

In [None]:
# Human-readable view of the first document: (token, count) pairs
# instead of (token_id, count) pairs.
[
    [(id2word_words[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

In [None]:
from pprint import pprint

# 2. Train a 10-topic LDA model on the bag-of-words corpus.
lda_params = dict(
    corpus=corpus,
    id2word=id2word_words,
    num_topics=10,
    random_state=100,      # fixed seed so the topics are reproducible
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.LdaModel(**lda_params)

# Show the top keywords (with weights) for each of the 10 topics.
pprint(lda_model.print_topics())

# Topic distribution of every document in the corpus.
doc_lda = lda_model[corpus]

In [None]:
#3
'''TOPIC 1: Regarding the videos showing the struggle that the victim was going through.
TOPIC 2: About black people and their supremacy. 
TOPIC 3: Working and learning together to stop racial injustice. 
TOPIC 4: Regarding the police being racist.
TOPIC 5: Regarding the racial crimes.
TOPIC 6: Regarding the pain and guilt felt by the families.
TOPIC 7: Regarding the black lives matter movement. 
TOPIC 8: Regarding love, peace and support. 
TOPIC 9: Regarding racial injustice.
TOPIC 10: Regarding justice for those who were targeted by hate crimes.
'''

# **Question 2: Sentiment Analysis**

(30 points). Sentiment analysis, also known as opinion mining, is a subfield of Natural Language Processing (NLP) that builds machine learning algorithms to classify a text according to the sentiment polarity of the opinions it contains, e.g., positive, negative, or neutral. The purpose of this question is to develop a machine learning classifier for sentiment analysis. Based on the dataset from assignment three, write a Python program to implement a sentiment classifier and evaluate its performance. Note: **use 80% of the data for training and 20% for testing**.

(1) Features used for sentiment classification and explain why you select these features.

(2) Select two of the supervised learning algorithms from the scikit-learn library: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning, and build a sentiment classifier with each of them. Note: Cross-validation (5-fold or 10-fold) should be conducted. Here is the reference for cross-validation: https://scikit-learn.org/stable/modules/cross_validation.html.

(3) Compare the performance over accuracy, precision, recall, and F1 score for the two algorithms you selected. Here is the reference of how to calculate these metrics: https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9. 

In [0]:
# Write your code here

#1
'''I utilized the TF-IDF vectorizer to extract features from my data corpus since it provides information on the relevance of the words in addition to their frequency, which I believe gives the words more weight when analyzing the various classes and enhancing the precision of our model.'''

# Reload the dataset file into a DataFrame for the sentiment task.
# NOTE(review): this rebinds `data`, which the topic-modeling cells bound to a
# single column selected from `df`.
data = pd.read_csv("datafile.csv")
data.head()




In [None]:
# TF-IDF vectorization with an 80/20 train/test split.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Raw inputs: the text (coerced to str so missing values do not break the
# vectorizer) and the sentiment labels.
texts = data['clean_text'].astype(str)
q = data['sentiment']

# Split BEFORE fitting the vectorizer. Fitting TF-IDF on all rows would let the
# test documents influence the vocabulary and IDF weights (data leakage), which
# makes the held-out scores optimistic.
text_train, text_test, q_train, q_test = train_test_split(
    texts, q, test_size=0.2, random_state=202
)

# Learn vocabulary and IDF weights from the training texts only, then apply
# the same fitted transformation to the test texts.
tfidf_vectorizer = TfidfVectorizer()
p_train = tfidf_vectorizer.fit_transform(text_train)
p_test = tfidf_vectorizer.transform(text_test)

print ("p_train", p_train.shape)
print ("p_test", p_test.shape)
print ("q_train", q_train.shape)
print ("q_test", q_test.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself), then predict the held-out set.
nb = MultinomialNB().fit(p_train, q_train)
predictions_nb = nb.predict(p_test)
predictions_nb

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for Naive Bayes on the test set.
c = classification_report(y_true=q_test, y_pred=predictions_nb)
print("Classification Report: ", "\n", "\n",c)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split.
# cross_val_score takes the features as `X` and the labels as `y`; the
# previous `P=` / `Q=` keywords do not exist and raised a TypeError.
accuracies_nb = cross_val_score(estimator=nb, X=p_train, y=q_train, cv=10)

print(f" The Accuracy for Naive Bayes Model is :  {round(accuracies_nb.mean()*100)}%")

In [None]:
# Support vector machine with a linear kernel.
from sklearn import svm

# Fit on the training features (fit() returns the estimator itself),
# then predict the held-out set.
t = svm.SVC(kernel='linear').fit(p_train, q_train)
predictions_svm = t.predict(p_test)
predictions_svm

In [None]:
# Per-class precision, recall and F1 for the SVM on the test set.
cr_svm = classification_report(y_true=q_test, y_pred=predictions_svm)
print("Classification Report: ", "\n", "\n",cr_svm)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split.
# cross_val_score takes the features as `X` and the labels as `y`; the
# previous `P=` / `Q=` keywords do not exist and raised a TypeError.
accuracies_svm = cross_val_score(estimator=t, X=p_train, y=q_train, cv=10)

print(f"Accuracy of the SVM Model is :  {round(accuracies_svm.mean()*100)}%")
#3
'''The accuracy of the SVM model after 10-fold cross validation is 79% as opposed to 72% for the Naive Bayes model. This shows that the SVM model outperforms the naive bayes model when it comes to categorizing tweets into discrete categories.'''


# **Question 3: House price prediction**

(40 points). You are required to build a **regression** model to predict the house price with 79 explanatory variables describing (almost) every aspect of residential homes. The purpose of this question is to practice regression analysis, a supervised learning technique. The training data, testing data, and data description files can be downloaded from Canvas. Here is an example implementation: https://towardsdatascience.com/linear-regression-in-python-predict-the-bay-areas-home-price-5c91c8378878. 


In [0]:
# Write your code here

# Load the house-price training file and preview its first rows.
train_data = pd.read_csv("train.csv")
train_data.head()


In [None]:
# Load the house-price test file and preview its first rows.
test_data = pd.read_csv("test.csv")
test_data.head()

In [None]:
# Report (rows, columns) for the training and test frames, in that order.
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# Summarize the training frame's columns (dtypes and non-null counts).
train_data.info()

In [None]:
# Summarize the test frame's columns (dtypes and non-null counts).
test_data.info()

In [None]:
# Count missing values per column in the training frame.
train_data.isnull().sum()

In [None]:
# Split the training columns into categorical (object dtype) and numerical (float dtype).
train_categorical_data = [var for var in train_data.columns if train_data[var].dtype=='O']
train_numerical_data = [var for var in train_data.columns if train_data[var].dtype=='float']

# Impute missing values in the float columns with each column's mean.
# The result is assigned back to the frame: calling
# `train_data[h].fillna(..., inplace=True)` acts on a temporary column object
# (chained assignment) and leaves the frame untouched under pandas Copy-on-Write.
if train_numerical_data:
    train_numeric = train_data[train_numerical_data]
    train_data[train_numerical_data] = train_numeric.fillna(train_numeric.mean())

# Remaining missing values per column (categorical columns are not imputed here).
train_data.isnull().sum()

In [None]:
# Count missing values per column in the test frame.
test_data.isnull().sum()

In [None]:
# Split the test columns into categorical (object dtype) and numerical (float dtype).
test_categorical_data = [var for var in test_data.columns if test_data[var].dtype=='O']
test_numerical_data = [var for var in test_data.columns if test_data[var].dtype=='float']

# Impute missing values in the float columns with each column's mean.
# The result is assigned back to the frame: calling
# `test_data[h].fillna(..., inplace=True)` acts on a temporary column object
# (chained assignment) and leaves the frame untouched under pandas Copy-on-Write.
if test_numerical_data:
    test_numeric = test_data[test_numerical_data]
    test_data[test_numerical_data] = test_numeric.fillna(test_numeric.mean())

# EDA: histogram of every numeric training column.
train_data.hist(bins=40, figsize=(20,20))
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Columns to compare pairwise: the sale price plus a hand-picked set of
# size, quality and age attributes.
att = [
    "SalePrice",
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "FullBath",
    "YearBuilt",
]
scatter_matrix(frame=train_data[att], figsize=(25, 20))

In [None]:
# Correlation analysis: heatmap of pairwise correlations between numeric columns.
plt.figure(figsize = (25,20))
# Restrict to numeric columns explicitly: DataFrame.corr() raises on
# object (string) columns in pandas >= 2.0 instead of silently skipping them.
sns.heatmap(train_data.select_dtypes(include="number").corr(), annot = True)

In [None]:
# Absolute pairwise correlations between the numeric columns only
# (DataFrame.corr() raises on string columns in pandas >= 2.0).
cmatrix = train_data.select_dtypes(include="number").corr().abs()

# Keep only the strict upper triangle so each column pair is inspected once
# and the diagonal (self-correlation of 1.0) is ignored.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the replacement.
upper_mask = np.triu(np.ones(cmatrix.shape, dtype=bool), k=1)
upper_tri = cmatrix.where(upper_mask)

# Flag a column when it correlates above 0.80 with any column that precedes it.
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.80)]
print("The attributes that needs to be dropped are:\n ",to_drop)

In [None]:
from sklearn.linear_model import LinearRegression
# LabelEncoder is no longer used in this cell; the import is kept in case
# other cells rely on the name being in scope.
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

TARGET_COLUMN = "SalePrice"

# Drop the highly correlated attributes and the Id column from both frames.
# - The target is never dropped, even if the correlation filter flagged it.
# - errors="ignore" plus re-assignment (instead of inplace=True) keeps this
#   cell safe to re-run and tolerant of columns that exist in only one frame.
columns_to_drop = [col for col in to_drop if col != TARGET_COLUMN] + ["Id"]
train_data = train_data.drop(columns=columns_to_drop, errors="ignore")
test_data = test_data.drop(columns=columns_to_drop, errors="ignore")

# Separate features and target by name; the target keeps its real values.
# (Label-encoding SalePrice would turn prices into arbitrary rank codes, so
# the regression would no longer predict prices.)
q_train = train_data[TARGET_COLUMN]
p_train = train_data.drop(columns=TARGET_COLUMN)
# Align the test features to the training column order.
p_test = test_data[p_train.columns].copy()

# Encode only the non-numeric feature columns. The encoder is fitted on the
# training data and reused for the test data so a category maps to the same
# code in both frames; categories seen only in the test data become -1.
categorical_columns = p_train.select_dtypes(exclude="number").columns.tolist()
if categorical_columns:
    # Missing categorical values are treated as their own "Missing" category.
    train_categories = p_train[categorical_columns].fillna("Missing").astype(str)
    test_categories = p_test[categorical_columns].fillna("Missing").astype(str)
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    p_train[categorical_columns] = encoder.fit_transform(train_categories)
    p_test[categorical_columns] = encoder.transform(test_categories)

# Safety net: fill any remaining numeric gaps with the training means
# (a no-op when the earlier imputation cells already filled them).
train_means = p_train.mean()
p_train = p_train.fillna(train_means)
p_test = p_test.fillna(train_means)

# Fit an ordinary least-squares regression and predict the test-set prices.
lin_reg = LinearRegression()
lin_reg.fit(p_train, q_train)
predictions_lr = lin_reg.predict(p_test)
# R-squared is computed on the training data (the test set has no target).
print("R-Square value: ", lin_reg.score(p_train,q_train))

In [None]:
# Tabulate the regression model's predictions for the test set.
pd.DataFrame({'Predicted House Price Values': predictions_lr})