In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from scipy import sparse
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import naive_bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation



import  textblob
from textblob import Word
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer, TfidfTransformer


import re    #for regex

from textblob import TextBlob
from sklearn.model_selection import train_test_split, GridSearchCV

import nltk
nltk.download('stopwords')
nltk.download('wordnet')


# Set the NLTK data path to the local directory




data = pd.read_csv('Womens Clothing Reviews Data.csv')


### 1. Performing Exploratory Analysis on the Data to Understand the Patterns:
 Exploratory analysis involves examining the dataset to get an initial understanding of its structure, variables, and potential patterns. This step is crucial for identifying trends, outliers, and relationships within the data. The goal is to gain insights into customer behaviors and preferences based on their demographics and reviews. Exploratory analysis may include data visualizations such as bar plots, histograms, scatter plots, and summary statistics.

##### a. Characteristics of the data

In [None]:
# The 'head()' method retrieves the top rows of the DataFrame. By default, it returns the first five rows.
data.head()

In [None]:
#this line show about dataset, including datatypes, non-null counts, and memory usage.
data.info()

In [None]:
#summary statistics for numerical columns 
data.describe()

In [None]:
#check null data
data.isnull().sum()

#### b. Data Cleaning 

In [None]:
#drop null data
data.dropna(inplace=True)

In [None]:
#check again if there's any other null data
data.isnull().sum()

In [None]:
# Find duplicate rows and drop them.
# drop_duplicates() returns a new DataFrame rather than modifying `data`,
# so the result has to be assigned back for the removal to take effect.
data = data.drop_duplicates()
data.shape

In [None]:
# Move cleaned data to other file
data.to_csv('cleaned data.csv', index=False)

In [None]:
## Data Visualization

In [None]:
# read the cleaned data 
data = pd.read_csv('cleaned data.csv')


#### c. Exploratory Data Analysis (EDA)


In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Recommend Flag',data=data, palette='YlGnBu_r')

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Rating',data=data, palette='YlGnBu_r')

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Rating',hue='Category',data= data,palette='CMRmap')


In [None]:
plt.figure(figsize=(8,4))
sns.set_style('whitegrid')
sns.countplot(x='Rating',hue='Subcategory1',data= data,palette='viridis')



In [None]:
plt.figure(figsize=(8,4))
sns.set_style('whitegrid')
sns.countplot(x='Recommend Flag',hue='Subcategory1',data=data,palette='YlGnBu_r')



In [None]:
plt.figure(figsize=(8,4))
# sns.distplot is deprecated; histplot with stat='density' and kde=True draws
# the same normalised histogram with a density curve overlaid.
sns.histplot(data['Customer Age'], color='darkred', bins=30, stat='density', kde=True)

### 2. Perform Text Mining Tasks to Understand the Most Frequent Words for Positive and Negative Sentiment. Create Word Clouds for the Positive & Negative Reviews Separately:

Text mining involves analyzing and extracting information from text data. In this case, the objective is to analyze customer reviews submitted on the website. The first step is to process and clean the text data by removing stopwords, special characters, and converting the text to lowercase. Next, sentiment analysis can be performed to classify reviews as positive or negative. Word frequency analysis will help identify the most common words used in positive and negative reviews. Word clouds are visual representations of word frequencies, and separate word clouds can be created for positive and negative sentiments.

In [None]:
# Add a column called 'sentiment' containing the sentiment polarity score for each review in the 'Review Text' column.
# This sentiment score reflects the overall sentiment of the text, whether it is positive, negative, or neutral.

data['Review Text'] = data['Review Text'].astype(str)
data['sentiment'] = data["Review Text"].apply(lambda x: TextBlob(x).sentiment.polarity ) 
data.head()




In [None]:
# Divide the data into positive, negative and neutral sentiments and add a column sentiment_cat:
#   polarity >  0.1  -> 'Positive'
#   polarity < -0.1  -> 'Negative'
#   otherwise        -> neutral
# The negative cut-off must be -0.1; with +0.1 every score below 0.1 is labelled
# negative and the neutral bucket only catches a polarity of exactly 0.1.
# The neutral label keeps the spelling 'Nuetral' because later cells select the
# dummy column 'sentiment_cat_Nuetral' by name.
data['sentiment_cat'] = np.where(
    data.sentiment > 0.1,
    'Positive',
    np.where(data.sentiment < -0.1, 'Negative', 'Nuetral'),
)
data.head()


In [None]:
# total positive, negative and neutral sentiments in sentiment_cat using .value_counts()
data.sentiment_cat.value_counts()

#df.sentiment_cat.value_counts()

In [None]:
data.sentiment_cat.value_counts().plot(kind='bar')


In [None]:
# Draw and save in the same cell. With Jupyter's default inline backend a figure
# is closed once its cell finishes, so a bare plt.savefig() in a later cell
# would write an empty canvas instead of the bar chart.
ax = data.sentiment_cat.value_counts().plot(kind='bar')
ax.figure.savefig('Sentiment_plot')


In [None]:
data.Rating.value_counts()  #rating count 


In [None]:
pd.crosstab(data.sentiment_cat, data.Rating)
#pd.crosstab(df.sentiment_cat, df.Rating)

##### Divide the data into three groups on the basis of sentiments like positive, negative and neutral

In [None]:
# One sub-frame per sentiment category.
data_neg = data[data.sentiment_cat == 'Negative']
data_pos = data[data.sentiment_cat == 'Positive']
# The neutral label is spelled 'Nuetral' where sentiment_cat is created, so the
# filter has to use the same spelling; matching on 'Neutral' always returned an
# empty frame.
data_neu = data[data.sentiment_cat == 'Nuetral']


In [None]:
## X-variable is Review_text and y-variable is Rating
# define X and y
X = data['Review Text']
y = data['Rating']

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
data.shape


##### Split the data into train & Test for positive sentiments and negative sentiments

In [None]:
# create a new DataFrame that only contains the 5 Rating and 1-Rating reviews
#women_clothing = women_clothing[(women_clothing.Rating==5) | (women_clothing.Rating==1)]

# define X and y
X2 = data_pos['Review Text']
y2 = data_pos['Rating']

# split the new DataFrame into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=1)
print(X2_train.shape)
print(X2_test.shape)
print(y2_train.shape)
print(y2_test.shape)

In [None]:
# define X and y
X1 = data_neg['Review Text']
y1 = data_neg['Rating']

# split the new DataFrame into training and testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=1)
print(X1_train.shape)
print(X1_test.shape)
print(y1_train.shape)
print(y1_test.shape)

In [None]:
data_pos.shape


In [None]:
data_neg.shape


##### Creating user-defined functions to clean the text and pre-process the data

In [None]:
# Abbreviation and word correction
def clean_text(Review_Text):
    """Normalise a single review string for vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. delete punctuation, symbols and digits;
      3. expand a fixed set of chat abbreviations (whole words only);
      4. collapse runs of spaces and trim the ends.

    Parameters
    ----------
    Review_Text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text.
    """
    Review_Text = Review_Text.lower()
    Review_Text = re.sub(r"[-()\"#/@;:{}`+=~|.!?*&£%€¦_><‘|,'0-9]", "", Review_Text)

    # Abbreviations are matched on word boundaries. A plain substring replace
    # would also rewrite longer words that merely contain the abbreviation
    # (e.g. 'water' -> 'whater').
    abbreviations = (
        ('wat', 'what'),
        ('txts', 'texts'),
        ('vry', 'very'),
        ('gud', 'good'),
        ('nyt', 'night'),
        ('msg', 'message'),
    )
    for short_form, expansion in abbreviations:
        Review_Text = re.sub(r'\b' + short_form + r'\b', expansion, Review_Text)

    # Collapse spaces last: deleting punctuation above can leave double spaces
    # behind (e.g. 'a - b' -> 'a  b').
    Review_Text = re.sub(r' +', ' ', Review_Text).strip()
    return Review_Text

In [None]:
sw = list(set(nltk.corpus.stopwords.words('english')))


In [None]:
stop = list(set(sw + ['the', 'me', 'how', 'what']))
print(stop)

In [None]:
def pre_process(Review_Text):
    """Remove stop words and lemmatise every review in a Series.

    Parameters
    ----------
    Review_Text : pandas.Series of str
        Review texts (already passed through ``clean_text`` by the callers in
        this notebook).

    Returns
    -------
    pandas.Series of str
        For each review: '/' characters removed, words found in the
        module-level ``stop`` list dropped, and the remaining words lemmatised
        with ``textblob.Word.lemmatize``, re-joined with single spaces.
    """
    # Build the set once; membership tests against the `stop` list are linear
    # and would otherwise run for every word of every review.
    stop_words = set(stop)

    def _filter_and_lemmatize(text):
        return " ".join(
            Word(token).lemmatize()
            for token in text.split()
            if token not in stop_words
        )

    # regex=False: '/' is a literal character here, not a pattern.
    Review_Text = Review_Text.str.replace('/', '', regex=False)
    return Review_Text.apply(_filter_and_lemmatize)

In [None]:
X_train = X_train.apply(lambda x: clean_text(x))
X_test = X_test.apply(lambda x: clean_text(x))

In [None]:
X_train=pre_process(X_train)
X_test=pre_process(X_test)

##### Clean the text and pre-process the data for positive sentiments

In [None]:
X2_train = X2_train.apply(lambda x: clean_text(x))
X2_test = X2_test.apply(lambda x: clean_text(x))

In [None]:
X2_train=pre_process(X2_train)
X2_test=pre_process(X2_test)

In [None]:
X2_train


##### Clean the text and pre-process the data for negative sentiments

In [None]:
X1_train = X1_train.apply(lambda x: clean_text(x))
X1_test = X1_test.apply(lambda x: clean_text(x))

In [None]:
X1_train=pre_process(X1_train)
X1_test=pre_process(X1_test)

##### Vectorization (Count, Tfidf) for positive sentiments

In [None]:
#Train
count_vect2 = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             ngram_range=(1, 1 ), 
                             min_df=5, 
                             encoding='latin-1' ,
                             max_features=800)
xtrain2_count = count_vect2.fit_transform(X2_train)

In [None]:
xtrain2_count


##### Vectorization (Count, Tfidf) for negative sentiments

In [None]:
count_vect1 = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             ngram_range=(1, 1 ), 
                             min_df=5, 
                             encoding='latin-1' ,
                             max_features=800)
xtrain1_count = count_vect1.fit_transform(X1_train)

##### View the document-term matrix for positive sentiments

In [None]:
dtm=xtrain2_count.toarray()



In [None]:
print(count_vect2.get_feature_names_out())


In [None]:
dtm1=pd.DataFrame(dtm, columns = count_vect2.get_feature_names_out())


In [None]:
dtm1


In [None]:
dtm1.apply(sum)


##### View the document-term matrix for negative sentiments

In [None]:
dtm4=xtrain1_count.toarray()


In [None]:
print(count_vect1.get_feature_names_out())


In [None]:
dtm5=pd.DataFrame(dtm4, columns = count_vect1.get_feature_names_out())


In [None]:
dtm5


In [None]:
dtm5.apply(sum)


##### Word frequencies for positive sentiments

In [None]:
# The 40 most frequent terms in the positive reviews.
# Sort by total count first and only then take the top 40; taking .head(40)
# before sorting keeps the first 40 terms in alphabetical order instead.
word_freq = dtm1.sum().sort_values(ascending=False).head(40).to_frame('freq')

In [None]:
word_freq


In [None]:
word_freq_dictionary = dict(dtm1.apply(sum))


In [None]:
word_freq.plot(kind='bar', color='Green')


##### Word frequencies for negative sentiments

In [None]:

# The 40 most frequent terms in the negative reviews.
# Sort by total count first and only then take the top 40; taking .head(40)
# before sorting keeps the first 40 terms in alphabetical order instead.
word_freq1 = dtm5.sum().sort_values(ascending=False).head(40).to_frame('freq')

In [None]:
word_freq_dictionary1 = dict(dtm5.apply(sum))


In [None]:
word_freq_dictionary1


##### For making word_clouds for positive sentiments

In [None]:
from wordcloud import WordCloud ,STOPWORDS
wordcloud = WordCloud(background_color='white', stopwords=stop)

In [None]:
wordcloud = wordcloud.generate_from_frequencies(word_freq_dictionary)

fig = plt.figure(1, figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
# Re-render the word cloud and save it from the same cell. With Jupyter's
# default inline backend the figure from the previous cell is already closed,
# so a bare plt.savefig() here would write an empty canvas.
fig = plt.figure(figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud)
fig.savefig('positive_wordcloud')


##### For making word_clouds for negative sentiments

In [None]:
from wordcloud import WordCloud ,STOPWORDS
wordcloud1 = WordCloud(background_color='white', stopwords=stop)

In [None]:
wordcloud1 = wordcloud1.generate_from_frequencies(word_freq_dictionary1)

fig = plt.figure(1, figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud1)
plt.show()

In [None]:
# Re-render the word cloud and save it from the same cell. With Jupyter's
# default inline backend the figure from the previous cell is already closed,
# so a bare plt.savefig() here would write an empty canvas.
fig = plt.figure(figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud1)
fig.savefig('Negative_wordcloud')


#### 3. Understand sentiment among the customers on the different categories, sub categories,products by location and age group

In [None]:
# pd.cut builds right-closed intervals, so with a first edge of 20 the lowest
# bin is (20, 29] and any customer aged 20 or younger is assigned NaN.
# Starting the first bin at 0 keeps those customers in the youngest group.
data['age group'] = pd.cut(x=data['Customer Age'], bins=[0, 29, 39, 49, 59, 69, 79, 89, 99])


In [None]:
data.head()


In [None]:
# Number of reviews per (location, age group, category, sub-categories, sentiment).
# 'age group' is categorical; observed=True restricts the result to key
# combinations that actually occur instead of the full Cartesian product.
# size() counts the rows of each group, which avoids aggregating a column that
# is also one of the grouping keys.
(
    data
    .groupby(['Location', 'age group', 'Category', 'Subcategory1', 'SubCategory2', 'sentiment_cat'],
             observed=True)
    .size()
    .to_frame('sentiment_cat')
)


#### 4.Perform predictive analytics to understand the drivers of customers who are recommending the products.


In [None]:
data.head()


In [None]:
X4 = data['Review Text']
y4 = data['Recommend Flag']

# split the new DataFrame into training and testing sets
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, random_state=1)
print(X4_train.shape)
print(X4_test.shape)
print(y4_train.shape)
print(y4_test.shape)

In [None]:
X4_train = X4_train.apply(lambda x: clean_text(x))
X4_test = X4_test.apply(lambda x: clean_text(x))

In [None]:
#Train
count_vect = CountVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}', 
                             ngram_range=(1, 1 ), 
                             min_df=5, 
                             lowercase = True,
                             encoding='latin-1' , 
                             max_features=100)
X_train_count4 = count_vect.fit_transform(X4_train)
feature_names_count = count_vect.get_feature_names_out()

tfidf_vect = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}', 
                             ngram_range=(1, 1 ), 
                             min_df=5, 
                             encoding='latin-1' , 
                             lowercase = True,
                             max_features=100)
X_train_tfidf4 = tfidf_vect.fit_transform(X4_train)
feature_names_tfidf = tfidf_vect.get_feature_names_out()

#Test
X_test_count4 = count_vect.transform(X4_test)
X_test_tfidf4 = tfidf_vect.transform(X4_test)

In [None]:
dtm_count=pd.DataFrame(X_train_count4.toarray(), columns=count_vect.get_feature_names_out())
dtm_tfidf=pd.DataFrame(X_train_tfidf4.toarray(), columns=tfidf_vect.get_feature_names_out())

In [None]:
data.head()


In [None]:
def create_dummies(df, colname):
    """Replace one column of ``df`` with its one-hot indicator columns.

    The indicators are named ``<colname>_<value>``; the first level is dropped
    (``drop_first=True``). The indicator columns are appended after the
    remaining columns of ``df``. The input frame is not modified.
    """
    indicator_cols = pd.get_dummies(df[colname], prefix=colname, drop_first=True)
    remaining_cols = df.drop(colname, axis=1)
    return pd.concat([remaining_cols, indicator_cols], axis=1)

In [None]:
categorical_features = ['Category', 'Subcategory1', 'SubCategory2', 'sentiment_cat', 'Location', 'Channel']

# .copy() gives an independent frame; assigning into a plain column selection of
# `data` triggers pandas' SettingWithCopyWarning.
cat_vars = data[categorical_features].copy()

# One-hot encode each categorical feature in turn (first level dropped).
for c_feature in categorical_features:
    cat_vars[c_feature] = cat_vars[c_feature].astype('category')
    cat_vars = create_dummies(cat_vars, c_feature)

cat_vars.head()

In [None]:
dtm_count


In [None]:
data.head()


In [None]:
data =data.loc[:, [ 'Review Text', 'Customer Age', 'Rating', 'sentiment','Recommend Flag']]


In [None]:
data.head()


In [None]:
data = pd.concat([ data,cat_vars], axis =1)


In [None]:
data.columns


In [None]:
data.head()


In [None]:
#define X and y
# The target 'Recommend Flag' is deliberately NOT part of feature_cols: keeping
# it among the features would leak the label into any model trained on the
# non-text columns.
feature_cols = [ 'Review Text', 'Customer Age', 'Rating', 'sentiment',
                'Category_General Petite', 'Category_Initmates','Subcategory1_Dresses', 'Subcategory1_Intimate',
                'Subcategory1_Jackets','Subcategory1_Tops', 'Subcategory1_Trend',
       'SubCategory2_Casual bottoms', 'SubCategory2_Chemises',
       'SubCategory2_Dresses', 'SubCategory2_Fine gauge',
       'SubCategory2_Intimates', 'SubCategory2_Jackets', 'SubCategory2_Jeans',
       'SubCategory2_Knits', 'SubCategory2_Layering', 'SubCategory2_Legwear',
       'SubCategory2_Lounge', 'SubCategory2_Outerwear', 'SubCategory2_Pants',
       'SubCategory2_Shorts', 'SubCategory2_Skirts', 'SubCategory2_Sleep',
       'SubCategory2_Sweaters', 'SubCategory2_Swim', 'SubCategory2_Trend',
       'sentiment_cat_Nuetral', 'sentiment_cat_Positive', 'Location_Chennai',
       'Location_Gurgaon', 'Location_Mumbai', 'Channel_Web']
X = data[feature_cols]
y = data['Recommend Flag']

#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
X_train['Review Text']


In [None]:
# use  TfidfVectorizer with Review_Text column only
vect = TfidfVectorizer(lowercase=True, stop_words='english', max_features=100, min_df=5, ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train['Review Text'])
X_test_dtm = vect.transform(X_test['Review Text'])
print(X_train_dtm.shape)
print(X_test_dtm.shape)

# shape of the remaining (non-text) feature columns
X_train.drop('Review Text', axis=1).shape

In [None]:
print(vect.get_feature_names_out())


In [None]:
# use  TfidfVectorizer with Review_Text column only
vect = TfidfVectorizer(lowercase=True, stop_words='english', max_features=100, min_df=5, ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train['Review Text'])
X_test_dtm = vect.transform(X_test['Review Text'])
print(X_train_dtm.shape)
print(X_test_dtm.shape)

# shape of the remaining (non-text) feature columns
X_train.drop('Review Text', axis=1).shape

In [None]:
print(vect.get_feature_names_out())


In [None]:
# cast other feature columns to float and convert to a sparse matrix
extra = sparse.csr_matrix(X_train.drop('Review Text', axis=1).astype(float))
extra.shape

# combine sparse matrices
X_train_dtm_extra = sparse.hstack((X_train_dtm, extra))
X_train_dtm_extra.shape

# repeat for testing set
extra = sparse.csr_matrix(X_test.drop('Review Text', axis=1).astype(float))
X_test_dtm_extra = sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape

In [None]:
# use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
print(dir(logreg))


In [None]:
# Finding the score for validation

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score,accuracy_score

tr_pred=logreg.predict(X_train_dtm)
y_pred = logreg.predict(X_test_dtm)


trprecision,trrecall,trfscore,trsupport=score(y_train,tr_pred)
tracc=accuracy_score(y_train,tr_pred)
precision,recall,fscore,support=score(y_test,y_pred)
acc=accuracy_score(y_test,y_pred)

In [None]:
# For Training

print('Precision : ',trprecision)
print('\nRecall : ',trrecall)
print('\nF-Score :',trfscore)
print('\nAccuracy : ',tracc)

In [None]:
print('Precision : ',precision)
print('\nRecall : ',recall)
print('\nF-Score :',fscore)
print('\nAccuracy : ',acc)

In [None]:
# use logistic regression with all features
#logreg1 = LogisticRegression(C=1e9)
#logreg1.fit(X_train_dtm_extra, y_train)
#y_pred_class = logreg.predict(X_test_dtm_extra)
#print(metrics.accuracy_score(y_test, y_pred_class))
#Saving model
import pickle
Pkl_Filename = "Pickle_LR_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(logreg, file)

In [None]:
 # Load the Model back from file
with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

Pickled_LR_Model

In [None]:
# Use the reloaded model to calculate the accuracy score and predict target values.

# Mean accuracy on the test set. Stored as `test_score` rather than `score`,
# because `score` is the alias under which precision_recall_fscore_support is
# imported in this notebook; rebinding it breaks the metric cells if they are
# re-run afterwards.
test_score = Pickled_LR_Model.score(X_test_dtm, y_test)
print("Test score: {0:.2f} %".format(100 * test_score))

# Predict the labels using the reloaded model
Ypredict = Pickled_LR_Model.predict(X_test_dtm)
Ypredict

In [None]:
print(vect.get_feature_names_out())


In [None]:
#define X and y
feature_cols = [ 'Review Text', 'Customer Age', 'sentiment','Recommend Flag',
                'Category_General Petite', 'Category_Initmates','Subcategory1_Dresses', 'Subcategory1_Intimate',
                'Subcategory1_Jackets','Subcategory1_Tops', 'Subcategory1_Trend',
       'SubCategory2_Casual bottoms', 'SubCategory2_Chemises',
       'SubCategory2_Dresses', 'SubCategory2_Fine gauge',
       'SubCategory2_Intimates', 'SubCategory2_Jackets', 'SubCategory2_Jeans',
       'SubCategory2_Knits', 'SubCategory2_Layering', 'SubCategory2_Legwear',
       'SubCategory2_Lounge', 'SubCategory2_Outerwear', 'SubCategory2_Pants',
       'SubCategory2_Shorts', 'SubCategory2_Skirts', 'SubCategory2_Sleep',
       'SubCategory2_Sweaters', 'SubCategory2_Swim', 'SubCategory2_Trend',
       'sentiment_cat_Nuetral', 'sentiment_cat_Positive', 'Location_Chennai',
       'Location_Gurgaon', 'Location_Mumbai', 'Channel_Web']
X = data[feature_cols]
y = data['Rating']

#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# use  TfidfVectorizer with Review_Text column only
vect = TfidfVectorizer(lowercase=True, stop_words='english', max_features=100, min_df=5, ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train['Review Text'])
X_test_dtm = vect.transform(X_test['Review Text'])
print(X_train_dtm.shape)
print(X_test_dtm.shape)

# shape of the remaining (non-text) feature columns
X_train.drop('Review Text', axis=1).shape

In [None]:
print(vect.get_feature_names_out())
# use logistic regression with text column only
logreg2 = LogisticRegression(C=1e9)
logreg2.fit(X_train_dtm, y_train)
y_pred_class = logreg2.predict(X_test_dtm)
print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
#Using KNN model
from sklearn.neighbors import KNeighborsClassifier

model3=KNeighborsClassifier(n_neighbors=5,n_jobs=-1)
model3.fit(X_train_dtm,y_train)

In [None]:
 # Finding the score for validation

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score,accuracy_score

tr_pred=logreg2.predict(X_train_dtm)
y_pred = logreg2.predict(X_test_dtm)


trprecision,trrecall,trfscore,trsupport=score(y_train,tr_pred)
tracc=accuracy_score(y_train,tr_pred)
precision,recall,fscore,support=score(y_test,y_pred)
acc=accuracy_score(y_test,y_pred)

In [None]:
# For Training

print('Precision : ',trprecision)
print('\nRecall : ',trrecall)
print('\nF-Score :',trfscore)
print('\nAccuracy : ',tracc)

In [None]:
# For Testing

print('Precision : ',precision)
print('\nRecall : ',recall)
print('\nF-Score :',fscore)
print('\nAccuracy : ',acc)

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid,  valid_y, is_neural_net=False):
    """Fit ``classifier`` and return its accuracy on the validation data.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``
    feature_vector_train : training feature matrix passed to ``fit``
    label : training labels passed to ``fit``
    feature_vector_valid : validation feature matrix passed to ``predict``
    valid_y : true labels for the validation rows
    is_neural_net : bool, default False
        When True, ``predict`` output is reduced with ``argmax`` over the last
        axis before scoring.

    Returns
    -------
    float
        ``metrics.accuracy_score`` of the predictions against ``valid_y``.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score's signature is (y_true, y_pred)
    return metrics.accuracy_score(valid_y, predictions)

In [None]:
#Naive Bayes (with only review_text in the X-variables)
# Naive Bayes on Count Vectors and TF-IDF


accuracy_L1 = train_model(naive_bayes.MultinomialNB(), X_train_dtm, y_train, X_test_dtm, y_test)
print("NB  for L1, TFIDF Vectors: ", accuracy_L1)

In [None]:
#Logistic Regression
# Logistic Regression on Count Vectors and TF-IDF
accuracy_L1 = train_model(LogisticRegression(), X_train_dtm, y_train, X_test_dtm, y_test)
print("LR  for L1, tfidf Vectors: ", accuracy_L1)

# Logistic Regression on Word Level TF IDF Vectors
#accuracy_L1 = train_model(LogisticRegression(), X_train_count, y_train, X_test_count, y_test)
#print("LR  for L1, WordLevel count: ", accuracy_L1)


In [None]:
#Linear SVC
# Linear SVC on the TF-IDF vectors (X_train_dtm / X_test_dtm come from the TfidfVectorizer `vect`)

accuracy_L1 = train_model(svm.LinearSVC(), X_train_dtm, y_train, X_test_dtm, y_test)
# The label names the representation actually used: TF-IDF, not count vectors.
print("SVC  for L1, TF-IDF Vectors: ", accuracy_L1)

# Linear SVC on Word Level TF IDF Vectors
#accuracy_L1 = train_model(svm.LinearSVC(), X_train_count, y_train, X_test_count, y_test)
#print("SVC  for L1, WordLevel TF-IDF: ", accuracy_L1)

In [None]:
print(vect.get_feature_names_out())


#### e. Create topics and understand themes behind the topics by performing Topic Mining

##### Topic Modeling using gensim

In [None]:
# Importing Gensim
import gensim
from gensim import corpora

# X_train is a DataFrame at this point, so iterating over it directly yields
# its column names rather than the reviews. Tokenize the cleaned, stop-word
# filtered review text instead.
review_docs = pre_process(X_train['Review Text'].apply(clean_text))
X_train_tokens = [doc.split() for doc in review_docs]

# Show only a few documents; the full token list is very long.
X_train_tokens[:5]


In [None]:
dictionary = corpora.Dictionary(X_train_tokens)
print(dictionary)

In [None]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in X_train_tokens]


In [None]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel


In [None]:
# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary,passes=1)

In [None]:
# Running and training the LDA model on the document-term matrix.
# random_state fixes the seed so the topics are reproducible across runs.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=1, random_state=1)

In [None]:
topics = ldamodel.show_topics(formatted=False, num_words=30)
    
for t in range(len(topics)):
    print("\nTopic {}, top {} words:".format(t+1,30))
    print(" ".join([w[0] for w in topics[t][1]]))

##### Topic Modeling using sklearn.decomposition


In [None]:
# train a LDA Model
from sklearn import decomposition


# random_state fixes the seed so the topics are reproducible across runs.
lda_model = decomposition.LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=20, random_state=1)
X_topics = lda_model.fit_transform(X_train_dtm)
topic_word = lda_model.components_
# X_train_dtm was produced by `vect`, so the columns of topic_word correspond to
# vect's vocabulary. `count_vect` was fitted on different text with a different
# tokenizer, and its feature names would mislabel the topic words.
vocab = vect.get_feature_names_out()

In [None]:
# view the topic models
n_top_words = 40
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

topic_summaries