<h1>Text Mining</h1>

<h3>Natural Language Toolkit</h3>
<p><a href ="https://www.nltk.org/">https://www.nltk.org/</a></p>

In [7]:
# load up libraries
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the NLTK data packages this notebook relies on.
# - 'punkt_tab' is fetched next to 'punkt' because newer NLTK releases load the
#   word_tokenize models from 'punkt_tab' instead of 'punkt'.
# - 'omw-1.4' is required by the WordNet lemmatizer (a LookupError naming it is
#   raised when it is missing).
NLTK_RESOURCES = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon', 'omw-1.4']
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Matt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Matt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Matt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Matt\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Matt\AppData\Roaming\nltk_data...


True

In [2]:
# Read the review CSV, discard rows with missing values, and parse the dates
review = (
    pd.read_csv("bartini_reviews.csv")
    .dropna()  # keep only rows where every column is populated
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))  # Date -> datetime
)
review


Unnamed: 0,Date,Username,Rating,Review
1,2023-11-16,Donald F.,5.0,Went up to see the daughter in Morgantown! Wan...
2,2024-05-03,Marianne D.,3.0,The bartender was amazing and he made really g...
3,2023-07-29,Brooke T.,1.0,Shown are the appetizers of Cheesy Nachos with...
4,2024-02-17,Sara L.,3.0,We walked in and immediately felt under dresse...
5,2024-03-12,Jen G.,5.0,Food was great. Service wonderful. One of the ...
...,...,...,...,...
115,2013-08-15,Jimmy G.,5.0,What a great find in Morgantown! Upscale marti...
116,2015-02-09,Sharon M.,1.0,Spent two nights at the Hilton Garden Inn. Cou...
117,2016-12-22,S M.,3.0,I have come here many many times. Usually the...
118,2014-10-01,J L.,1.0,The owner pulled a fast one and changed happy ...


<h3>Tokenize</h3>
<p>Splitting the text into individual words or "tokens"</p>

In [3]:
# Split each review's text into a list of tokens (words and punctuation marks)
review['tokens'] = review['Review'].map(word_tokenize)
review

Unnamed: 0,Date,Username,Rating,Review,tokens
1,2023-11-16,Donald F.,5.0,Went up to see the daughter in Morgantown! Wan...,"[Went, up, to, see, the, daughter, in, Morgant..."
2,2024-05-03,Marianne D.,3.0,The bartender was amazing and he made really g...,"[The, bartender, was, amazing, and, he, made, ..."
3,2023-07-29,Brooke T.,1.0,Shown are the appetizers of Cheesy Nachos with...,"[Shown, are, the, appetizers, of, Cheesy, Nach..."
4,2024-02-17,Sara L.,3.0,We walked in and immediately felt under dresse...,"[We, walked, in, and, immediately, felt, under..."
5,2024-03-12,Jen G.,5.0,Food was great. Service wonderful. One of the ...,"[Food, was, great, ., Service, wonderful, ., O..."
...,...,...,...,...,...
115,2013-08-15,Jimmy G.,5.0,What a great find in Morgantown! Upscale marti...,"[What, a, great, find, in, Morgantown, !, Upsc..."
116,2015-02-09,Sharon M.,1.0,Spent two nights at the Hilton Garden Inn. Cou...,"[Spent, two, nights, at, the, Hilton, Garden, ..."
117,2016-12-22,S M.,3.0,I have come here many many times. Usually the...,"[I, have, come, here, many, many, times, ., Us..."
118,2014-10-01,J L.,1.0,The owner pulled a fast one and changed happy ...,"[The, owner, pulled, a, fast, one, and, change..."


<h3>Stopwords</h3>
<p>Removing common words that do not carry significant meaning</p>

In [4]:
stop_words = set(stopwords.words('english'))


def _is_content_token(token):
    """Return True when `token` is not an English stopword (case-insensitive)
    and contains neither an apostrophe nor a backtick."""
    if token.lower() in stop_words:
        return False
    return "'" not in token and "`" not in token


review['tokens'] = review['tokens'].apply(
    lambda tokens: [token for token in tokens if _is_content_token(token)]
)
review

Unnamed: 0,Date,Username,Rating,Review,tokens
1,2023-11-16,Donald F.,5.0,Went up to see the daughter in Morgantown! Wan...,"[Went, see, daughter, Morgantown, !, Wanted, s..."
2,2024-05-03,Marianne D.,3.0,The bartender was amazing and he made really g...,"[bartender, amazing, made, really, good, drink..."
3,2023-07-29,Brooke T.,1.0,Shown are the appetizers of Cheesy Nachos with...,"[Shown, appetizers, Cheesy, Nachos, prime, fil..."
4,2024-02-17,Sara L.,3.0,We walked in and immediately felt under dresse...,"[walked, immediately, felt, dressed, ,, rolled..."
5,2024-03-12,Jen G.,5.0,Food was great. Service wonderful. One of the ...,"[Food, great, ., Service, wonderful, ., One, b..."
...,...,...,...,...,...
115,2013-08-15,Jimmy G.,5.0,What a great find in Morgantown! Upscale marti...,"[great, find, Morgantown, !, Upscale, martini,..."
116,2015-02-09,Sharon M.,1.0,Spent two nights at the Hilton Garden Inn. Cou...,"[Spent, two, nights, Hilton, Garden, Inn, ., C..."
117,2016-12-22,S M.,3.0,I have come here many many times. Usually the...,"[come, many, many, times, ., Usually, food, st..."
118,2014-10-01,J L.,1.0,The owner pulled a fast one and changed happy ...,"[owner, pulled, fast, one, changed, happy, hou..."


<h3>Remove Punctuation</h3>

In [5]:
# Drop every token that consists solely of punctuation characters.
# A plain `word not in string.punctuation` is a substring test against the
# punctuation constant, so it only removes tokens that appear verbatim inside
# that string; multi-character tokens such as "..." or "--" would be kept.
punctuation_chars = set(string.punctuation)
review['tokens'] = review['tokens'].apply(
    lambda x: [word for word in x if not all(char in punctuation_chars for char in word)]
)

<h3>Lemmatization</h3>
<p>Reducing words to their base or root form</p>

In [6]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list with lemmatizer.lemmatize applied to each token, in order."""
    # NOTE(review): tokens still carry their original capitalisation here —
    # confirm the lemmatizer treats capitalised forms the way you expect.
    return list(map(lemmatizer.lemmatize, tokens))


review['tokens'] = review['tokens'].apply(_lemmatize_tokens)

LookupError: 
**********************************************************************
  Resource [93momw-1.4[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('omw-1.4')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/omw-1.4[0m

  Searched in:
    - 'C:\\Users\\Matt/nltk_data'
    - 'C:\\Users\\Matt\\anaconda3\\nltk_data'
    - 'C:\\Users\\Matt\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Matt\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Matt\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


<h2>Basic Analysis</h2>

In [None]:
!pip install textblob

In [None]:
# load some libraries

from collections import Counter
import matplotlib.pyplot as plt
from textblob import TextBlob


<h3>Word Frequency Analysis</h3>

In [None]:
# Count how often each token occurs across all reviews. Tokens are lower-cased
# while counting so that capitalised and lower-case spellings of the same word
# (e.g. "Food" and "food") contribute to a single total. The token lists stored
# in review['tokens'] are left untouched.
word_freq = Counter(word.lower() for sublist in review['tokens'] for word in sublist)
top_words = word_freq.most_common(10)  # list of (word, count) pairs, most frequent first
print("Top 10 frequent words:", top_words)

In [None]:
# Bar chart of the most frequent tokens found in top_words
fig, ax = plt.subplots()
# zip(*top_words) turns the (word, count) pairs into a words tuple and a counts tuple
ax.bar(*zip(*top_words))
ax.set(xlabel='Words', ylabel='Frequency', title='Top 10 Frequent Words')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

<h3>Vader Sentiment Analyzer</h3>
<p><a href="https://vadersentiment.readthedocs.io/">https://vadersentiment.readthedocs.io/</a></p>

In [None]:
# Initialize the VADER sentiment analyzer (NLTK's SentimentIntensityAnalyzer).
# The single instance `sid` is created once here and looked up as a global by
# analyze_sentiment for every review it scores.
sid = SentimentIntensityAnalyzer()

In [None]:
def analyze_sentiment(tokens, threshold=0.05):
    """Label a tokenized review as "Positive", "Negative" or "Neutral".

    The tokens are joined back into one space-separated string, scored with
    the global analyzer ``sid``, and classified by the ``compound`` entry of
    the returned score dictionary.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.
    threshold : float, default 0.05
        Cutoff applied symmetrically to the compound score:
        ``compound >= threshold`` gives "Positive",
        ``compound <= -threshold`` gives "Negative",
        anything in between gives "Neutral".

    Returns
    -------
    str
        One of "Positive", "Negative", "Neutral".
    """
    compound = sid.polarity_scores(' '.join(tokens))['compound']
    if compound >= threshold:
        return "Positive"
    if compound <= -threshold:
        return "Negative"
    return "Neutral"


In [None]:
# Label every review by running analyze_sentiment over its token list
review['Opinion'] = review['tokens'].map(analyze_sentiment)

In [None]:
# Show each review next to its sentiment label. Leaving the frame as the
# cell's last expression lets the notebook render it as a table instead of
# the plain-text dump produced by print().
review[['Review', 'Opinion']]

In [None]:
# Bar plot: number of reviews in each Opinion class

import seaborn as sns

ax = sns.countplot(data=review, x="Opinion")

# Set the title; both axis labels are set to empty strings
ax.set(title='Opinion Sentiment', xlabel='', ylabel='')

# Render the figure
plt.show()


In [None]:
# Sentiment trend: number of reviews per calendar month and sentiment label.
# Uses the 'Opinion' column that the earlier sentiment cell stored on `review`,
# so the labels are not recomputed here.
SENTIMENT_LABELS = ['Positive', 'Negative', 'Neutral']

# Aggregate sentiment over time
review['month'] = review['Date'].dt.to_period('M')
sentiment_over_time = (
    review.groupby(['month', 'Opinion'])
    .size()
    .unstack(fill_value=0)
    # Guarantee one column per label: a label that never occurs in the data
    # would be absent after unstack() and make melt() below raise a KeyError.
    .reindex(columns=SENTIMENT_LABELS, fill_value=0)
    .reset_index()
)

# Convert period to datetime so the x axis is plotted as real dates
sentiment_over_time['month'] = sentiment_over_time['month'].dt.to_timestamp()

# Melt the DataFrame to long format (one row per month/label) for seaborn
sentiment_melted = sentiment_over_time.melt(
    id_vars='month',
    value_vars=SENTIMENT_LABELS,
    var_name='Sentiment',
    value_name='Count',
)

# Plot the sentiment trends over time using seaborn
plt.figure(figsize=(12, 6))
sns.scatterplot(data=sentiment_melted, x='month', y='Count', hue='Sentiment', style='Sentiment', s=100)
plt.title('Sentiment Analysis Over Time')
plt.xlabel('Month')
plt.ylabel('Number of Reviews')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
