## Cycle 3 Iteration 4 description 

In this iteration, I build on Iteration 3 and create user interface

In [1]:
import pandas as pd

# USER INPUT 

file_path = input("Please enter the file pathway: ")

try:
    # Attempt to read the file into a DataFrame
    df = pd.read_csv(file_path)  # Assuming the file is a CSV file, change the function accordingly for other file types
    print("DataFrame created successfully:")
    print(df.head())  # Print the first few rows of the DataFrame
    print(df.info())  # Print info about the dataframe 
except FileNotFoundError:
    # Handle the case when the file is not found
    print("Error: File not found.")
except Exception as e:
    # Handle any other exceptions that might occur
    print("An error occurred:", e)

# NULL VALUES CHECK     

# Check for null values 

# Count null values in 'review/text' column before dropping
null_count_before = df['text'].isnull().sum()

# Drop rows with null values in 'review/text' column
df.dropna(subset=['text'], inplace=True)

# Count null values in 'review/text' column after dropping
null_count_after = df['text'].isnull().sum()

# Calculate the count of dropped rows
dropped_count = null_count_before - null_count_after

# Print the count of dropped rows
print("- Preprocessing step 1/9: Null value check done. Number of dropped rows due to null values:", dropped_count)

# TOKENIZING

import nltk
from nltk.tokenize import word_tokenize

# Ensure that you have the 'punkt' tokenizer models downloaded
nltk.download('punkt')

# Convert values in the 'review/text' column to strings and then tokenize
df['tokens'] = df['text'].astype(str).apply(word_tokenize)

print("- Preprocessing step 2/9: Text has been tokenized successfully!")

# PREPROCESSING TOKENIZED DATA 

# Function to lowercase each word in a list of tokens
def lowercase_tokens(tokens_list):
    return [word.lower() for word in tokens_list]

# Apply lowercase conversion to each list of tokens in the 'tokens' column
df.loc[:, 'tokens'] = df['tokens'].apply(lowercase_tokens)
print("- Preprocessing step 3/9: Tokenized text lowercased successfully!")

import re
# Apply punctuation removal using a lambda function
df['tokens'] = df['tokens'].apply(lambda tokens_list: [re.sub(r'[^\w\s]', '', word) for word in tokens_list] if isinstance(tokens_list, list) else [])
print("- Preprocessing step 4/9: Punctuation removed from tokenized text successfully!")

from nltk.corpus import stopwords
# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
# Get the English stopwords from NLTK
stop_words = set(stopwords.words('english'))
# Remove stopwords from the 'tokens' column
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
print("- Preprocessing step 5/9: Stopwords removed from tokenized text successfully!")

# Remove blank text from the list of tokens
df['tokens'] = df['tokens'].apply(lambda tokens_list: [token for token in tokens_list if token.strip() != ''])
print("- Preprocessing step 6/9: Blank text removed from tokenized text successfully!")

# Remove numerical values 
df['tokens'] = df['tokens'].apply(lambda x: [token for token in x if not token.isdigit()])
print("- Preprocessing step 7/9: Numerical values removed from tokenized text successfully!")

# Remove special characters 
df['tokens'] = df['tokens'].apply(lambda x: [re.sub(r'\W', '', token) for token in x])
print("- Preprocessing step 8/9: Special characters removed from tokenized text successfully!")

# Remove words with just one letter 

# Define the target words to count
target_words = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# Initialize counts for each target word
word_counts = {word: 0 for word in target_words}

# Iterate over the tokens and count occurrences of target words
for tokens_list in df['tokens']:
    for word in tokens_list:
        for target_word in target_words:
            if target_word == word:
                word_counts[target_word] += 1
            elif target_word.strip() == word:
                word_counts[target_word] += 1

df['tokens'] = df['tokens'].apply(lambda tokens_list: [word for word in tokens_list if word not in target_words and word.strip() not in target_words])
print("- Preprocessing step 9/9: One letter words removed from tokenized text successfully!")
print("- Preprocessing complete!")

# TOKEN NORMALIZATION

from nltk.stem import PorterStemmer

# Initialize PorterStemmer
stemmer = PorterStemmer()

# Apply stemming to tokens
df['tokens'] = df['tokens'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x]))
print("Tokenized and preprocessed text data has been stemmed with Porter Stemmer!")

# Reset index

df.reset_index(drop=True, inplace=True)

# WORD EMBEDDING

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

# Load pre-trained Word2Vec model 
word2vec_model = Word2Vec(df['tokens'], min_count=1)

# Function to apply word embeddings
def apply_word_embeddings(tokens):
    word_vectors = []
    for token in tokens:
        if token in word2vec_model.wv:
            word_vector = word2vec_model.wv[token]  # Get word vector
            word_vectors.append(word_vector)
    if word_vectors:
        tweet_vector = np.mean(word_vectors, axis=0)  # Aggregate word vectors
        return tweet_vector
    else:
        return None

# Apply word embeddings to the 'tokens' column
df['word_embeddings'] = df['tokens'].apply(apply_word_embeddings)

# Drop rows with None values in 'word_embeddings' column
df = df.dropna(subset=['word_embeddings'])
df.head()

Please enter the file pathway:  /Users/matthewbatchelor/Downloads/test_data.csv


DataFrame created successfully:
   unique identifier                                               text
0              72489  The service was exceptional! The staff was fri...
1              53072  I had a terrible experience at your store. The...
2              16947  I'm neutral about my experience at your establ...
3              87532  The online ordering process was smooth and has...
4              41293  I'm disappointed with the quality of the produ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   unique identifier  20 non-null     int64 
 1   text               20 non-null     object
dtypes: int64(1), object(1)
memory usage: 452.0+ bytes
None
- Preprocessing step 1/9: Null value check done. Number of dropped rows due to null values: 0


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matthewbatchelor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewbatchelor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


- Preprocessing step 2/9: Text has been tokenized successfully!
- Preprocessing step 3/9: Tokenized text lowercased successfully!
- Preprocessing step 4/9: Punctuation removed from tokenized text successfully!
- Preprocessing step 5/9: Stopwords removed from tokenized text successfully!
- Preprocessing step 6/9: Blank text removed from tokenized text successfully!
- Preprocessing step 7/9: Numerical values removed from tokenized text successfully!
- Preprocessing step 8/9: Special characters removed from tokenized text successfully!
- Preprocessing step 9/9: One letter words removed from tokenized text successfully!
- Preprocessing complete!
Tokenized and preprocessed text data has been stemmed with Porter Stemmer!


Unnamed: 0,unique identifier,text,tokens,word_embeddings
0,72489,The service was exceptional! The staff was fri...,servic except staff friendli help food delici ...,"[-0.008174025, 0.025680337, 0.013656314, 0.022..."
1,53072,I had a terrible experience at your store. The...,terribl experi store staff rude product qualit...,"[-0.010783804, 0.02686394, 0.01255794, 0.02368..."
2,16947,I'm neutral about my experience at your establ...,neutral experi establish servic averag product...,"[-0.01114888, 0.027538154, 0.013205601, 0.0243..."
3,87532,The online ordering process was smooth and has...,onlin order process smooth hasslefre receiv pa...,"[-0.01048251, 0.027087936, 0.014321848, 0.0252..."
4,41293,I'm disappointed with the quality of the produ...,disappoint qualiti product receiv nt meet expect,"[-0.0104486095, 0.025587244, 0.012627092, 0.02..."


In [2]:
# test csv : /Users/matthewbatchelor/Downloads/test_data.csv

## Make predictions 

In [3]:
import numpy as np
import tensorflow as tf
import pickle

# Load the saved model using pickle
with open('best_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Convert word embeddings to numpy array
X_new = np.array(df['word_embeddings'].tolist())

# Standardize features if necessary
scaler = StandardScaler()
X_new_scaled = scaler.transform(X_new)

# Make predictions on the new data
predictions = model.predict(X_new_scaled)

print(predictions)

2024-03-07 13:35:12.251446: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


AttributeError: Can't get attribute 'create_model' on <module '__main__'>