***********************     1    *****************************

In [None]:
import pickle
import pandas as pd

def extract_columns(input_file, output_file, columns=('title', 'brand', 'asin')):
    """Copy a subset of columns from one pickled DataFrame into a new pickle.

    Args:
        input_file: Path of the pickle file holding the source DataFrame.
        output_file: Path the reduced DataFrame is pickled to.
        columns: Column labels to keep. Defaults to ``title``, ``brand``
            and ``asin``, the selection this function was originally
            hard-wired to.

    Raises:
        KeyError: If a requested column is missing from the DataFrame.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(input_file)

    # list() makes pandas treat a tuple as several labels, not one key
    extracted_df = df[list(columns)]

    extracted_df.to_pickle(output_file)
    print("Selected columns saved to pickle file successfully.")

# Reduce the metadata pickle and write the result to a second pickle
input_file = 'df_metadata.pickle'
output_file = 'df_metadata_output.pickle'
extract_columns(input_file=input_file, output_file=output_file)


In [None]:
import pickle
import pandas as pd

def extract_columns(input_file, output_file, columns):
    """Pickle the requested columns of a pickled DataFrame to a new file.

    Args:
        input_file: Path of the pickle file holding the source DataFrame.
        output_file: Path the selected columns are pickled to.
        columns: Column labels to keep, passed straight to ``df[columns]``.
    """
    source_frame = pd.read_pickle(input_file)

    # Select and write in one step; a missing label raises KeyError here
    source_frame[columns].to_pickle(output_file)

    print("Selected columns saved to pickle file successfully.")

# Review columns kept for the later steps
columns_to_extract = [
    'overall',
    'vote',
    'reviewerID',
    'reviewTime',
    'reviewText',
    'asin',
]

# Reduce the reviews pickle to those columns
input_file = 'df_reviews.pickle'
output_file = 'df_reviews_output.pickle'
extract_columns(input_file, output_file, columns=columns_to_extract)


In [None]:
import pandas as pd

def combine_data(review_file, metadata_file, output_file):
    """Inner-join pickled reviews with pickled metadata on ``asin``.

    Args:
        review_file: Path of the pickle holding the reviews DataFrame.
        metadata_file: Path of the pickle holding the metadata DataFrame.
        output_file: Path the joined DataFrame is pickled to.
    """
    reviews = pd.read_pickle(review_file)
    metadata = pd.read_pickle(metadata_file)

    # Inner join: only reviews whose asin also appears in the metadata survive
    merged = reviews.merge(metadata, how='inner', on='asin')

    merged.to_pickle(output_file)
    print("Combined data saved to pickle file successfully.")

# Inputs are the two reduced pickles; the joined result goes to a third file
review_file = 'df_reviews_output.pickle'
metadata_file = 'df_metadata_output.pickle'
output_file = 'combined_data.pickle'

combine_data(
    review_file=review_file,
    metadata_file=metadata_file,
    output_file=output_file,
)


***********************************        2            **************************

In [None]:
import pandas as pd

def extract_headphones_data(combined_file, output_file, keyword='headphones'):
    """Keep the rows whose ``title`` mentions a keyword and pickle them.

    Args:
        combined_file: Path of the pickle holding the combined DataFrame.
        output_file: Path the filtered DataFrame is pickled to.
        keyword: Text searched for in ``title``, case-insensitively and as
            a literal substring. Defaults to ``'headphones'``.
    """
    combined_df = pd.read_pickle(combined_file)
    print("Total number of rows before extraction:", len(combined_df))

    # na=False: a missing title would otherwise put NaN into the mask, and
    # boolean indexing rejects masks that contain NaN. Such rows are treated
    # as "no match".
    title_matches = combined_df['title'].str.contains(
        keyword, case=False, na=False, regex=False
    )
    headphones_data = combined_df[title_matches]
    print("Total number of rows after extraction:", len(headphones_data))

    headphones_data.to_pickle(output_file)
    print("Data for headphones saved to pickle file successfully.")

# Filter the combined pickle down to headphone products
combined_file = 'combined_data.pickle'
output_file = 'headphones_data.pickle'

extract_headphones_data(combined_file=combined_file, output_file=output_file)


***************************    3      ***********************************

In [None]:
import pickle
import pandas as pd

def preprocess_headphones_data(file_path, output_file='headphones_Clean_data.pickle'):
    """Fill missing values, drop duplicate rows and pickle the result.

    Args:
        file_path: Path of the pickle holding the raw DataFrame.
        output_file: Path the cleaned DataFrame is pickled to. Defaults to
            ``'headphones_Clean_data.pickle'``; the input file itself is
            never overwritten.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    df = pd.read_pickle(file_path)

    # fillna(0) applies to every column, so missing text cells become 0 too.
    # Filling first means rows that differed only by NaN vs 0 collapse
    # together in the de-duplication that follows.
    cleaned = df.fillna(0).drop_duplicates()

    total_rows = len(cleaned)
    print("Total number of rows for the product:", total_rows)

    cleaned.to_pickle(output_file)
    print("Preprocessing completed and saved to pickle file successfully.")

# Pickle to clean; point this at your own file if it lives elsewhere
file_path = 'headphones_data.pickle'

try:
    preprocess_headphones_data(file_path)
except FileNotFoundError:
    print("File not found. Please provide the correct path to the pickle file.")
except Exception as error:
    # Top-level boundary: report any other failure instead of aborting the run
    print("An error occurred:", error)


********************************  4   ***************************

In [None]:
import pandas as pd

# Cleaned headphone reviews written by the preprocessing step
file_path = 'headphones_Clean_data.pickle'
df = pd.read_pickle(file_path)

# Compute every statistic first, then print them in order a-f.
# A rating of 3 or more counts as 'Good'; anything lower is 'Bad'.
num_reviews = df.shape[0]
average_rating_score = df['overall'].mean()
num_unique_products = df['asin'].nunique()
num_good_ratings = df.loc[df['overall'] >= 3, 'overall'].count()
num_bad_ratings = df.loc[df['overall'] < 3, 'overall'].count()
rating_counts = df['overall'].value_counts().sort_index()

print("a. Number of Reviews:", num_reviews)
print("b. Average Rating Score:", average_rating_score)
print("c. Number of Unique Products:", num_unique_products)
print("d. Number of Good Ratings:", num_good_ratings)
print("e. Number of Bad Ratings:", num_bad_ratings)
print("f. Number of Reviews corresponding to each Rating:")
print(rating_counts)


******************************************** 5 ***************************

In [None]:
import re
from unidecode import unidecode
from nltk.stem import WordNetLemmatizer
import nltk
# 'wordnet' backs the lemmatizer. nltk.word_tokenize (called by
# preprocess_text) additionally needs the Punkt tokenizer data, which the
# original cell never downloaded.
# NOTE(review): newer NLTK releases look the data up as 'punkt_tab', older
# ones as 'punkt'; both are requested here. Confirm against the installed
# version and drop whichever one is reported as unknown.
for resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Common chat acronyms and their expanded forms (keys are lower-case)
acronyms = {
    'lol': 'laughing out loud',
    'brb': 'be right back',
    'btw': 'by the way',
    'omg': 'oh my god',
    'idk': 'I don\'t know',
    'imo': 'in my opinion',
    'imho': 'in my humble opinion',
    'fyi': 'for your information',
    'afaik': 'as far as I know',
    'tbh': 'to be honest',
    # Add more acronyms as needed
}

# Shared WordNet lemmatizer instance used by preprocess_text
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    """Normalise one review into lower-case, lemmatised, letters-only text.

    Steps, in order: strip HTML tags, transliterate accented characters,
    expand the acronyms in the module-level ``acronyms`` table, drop
    everything except letters and whitespace, lower-case, tokenize and
    lemmatise with the module-level ``lemmatizer``.

    Args:
        text: Raw review text. Any non-string value (for example the 0 that
            the cleaning step writes into missing cells) yields ``''``.

    Returns:
        The cleaned text with tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Replace tags with a space so "good<br>sound" does not fuse into one word
    text = re.sub(r'<.*?>', ' ', text)

    # Transliterate accented characters to their closest ASCII form
    text = unidecode(text)

    # Expand acronyms only as whole words and regardless of case. A plain
    # substring replace would turn "lollipop" into "laughing out loudlipop"
    # and would miss "LOL". The callable replacement keeps backslashes in an
    # expansion from being read as regex escapes.
    for acronym, expanded_form in acronyms.items():
        text = re.sub(
            rf'\b{re.escape(acronym)}\b',
            lambda _match, replacement=expanded_form: replacement,
            text,
            flags=re.IGNORECASE,
        )

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Lower-case BEFORE lemmatising so capitalised words are lemmatised too
    # (the original lower-cased only after lemmatisation).
    # NOTE(review): assumes WordNetLemmatizer lookups are case-sensitive;
    # confirm against the NLTK documentation.
    text = text.lower()

    tokens = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Clean every review, then overwrite the 'reviewText' column with the result
cleaned_reviews = df['reviewText'].apply(preprocess_text)
df['reviewText'] = cleaned_reviews

# Persist the frame under a new name; the input pickle is left untouched
df.to_pickle('headphones_Clean_data_afterpreprocess.pickle')
print("Text preprocessed and saved to pickle file.")


*********************************** 6  ****************************

In [None]:
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the preprocessed DataFrame from pickle file
df = pd.read_pickle('headphones_Clean_data_afterpreprocess.pickle')

# Ratings at or above this value are 'Good', the rest 'Bad'. This is the same
# ">= 3" threshold the rating statistics earlier in this file use; the word
# clouds previously split at 4, which silently moved every 3-star review into
# the 'Bad' cloud.
GOOD_RATING_THRESHOLD = 3

# Tasks a/b: brands ranked by review count. value_counts() sorts descending,
# so head() gives the most reviewed and tail() the least reviewed brands.
brand_review_counts = df['brand'].value_counts()
top_20_most_reviewed_brands = brand_review_counts.head(20)
bottom_20_least_reviewed_brands = brand_review_counts.tail(20)

# Task c: product with the highest mean rating (first one wins on ties)
average_ratings = df.groupby('asin')['overall'].mean()
most_positively_reviewed_headphone = average_ratings.idxmax()

# Task d: review counts for the last five years present in the data
df['year'] = pd.to_datetime(df['reviewTime']).dt.year
ratings_over_5_years = df.groupby('year').size().tail(5)

# Task e: one word cloud per rating group
is_good_rating = df['overall'] >= GOOD_RATING_THRESHOLD
good_reviews = ' '.join(df.loc[is_good_rating, 'reviewText'])
bad_reviews = ' '.join(df.loc[~is_good_rating, 'reviewText'])

wordcloud_good = WordCloud(width=800, height=400).generate(good_reviews)
wordcloud_bad = WordCloud(width=800, height=400).generate(bad_reviews)

# Task f: number of reviews per rating value
rating_distribution = df['overall'].value_counts()

# Task g: year with the most reviews
year_with_max_reviews = df['year'].value_counts().idxmax()

# Task h: year with the most distinct reviewers
customers_per_year = df.groupby('year')['reviewerID'].nunique()
year_with_highest_customers = customers_per_year.idxmax()

# Task f chart: share of reviews per rating value
rating_distribution.plot(kind='pie', autopct='%1.1f%%', figsize=(8, 8))
plt.title('Distribution of Ratings')
plt.ylabel('')
plt.show()

# Text results for tasks a-d (heading and value on consecutive lines)
print("Task a: Top 20 most reviewed brands", top_20_most_reviewed_brands, sep="\n")
print("\nTask b: Top 20 least reviewed brands", bottom_20_least_reviewed_brands, sep="\n")
print("\nTask c: Most positively reviewed headphone")
print("ASIN:", most_positively_reviewed_headphone)
print(
    "\nTask d: Count of ratings for the product over 5 consecutive years",
    ratings_over_5_years,
    sep="\n",
)

# Task e: the two word clouds side by side in one figure
print("\nTask e: Word Cloud for 'Good' and 'Bad' ratings")
word_cloud_panels = (
    (wordcloud_good, 'Word Cloud for Good Ratings'),
    (wordcloud_bad, 'Word Cloud for Bad Ratings'),
)
plt.figure(figsize=(12, 6))
for panel_number, (cloud, panel_title) in enumerate(word_cloud_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis('off')
plt.show()

# Tasks g and h
print("\nTask g: Year with maximum reviews")
print("Year:", year_with_max_reviews)
print("\nTask h: Year with the highest number of customers")
print("Year:", year_with_highest_customers)


******************************************   7 *********************************************

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer; max_features caps the vocabulary size (adjust as needed)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary from the review text and vectorize it in one pass
tfidf_features = tfidf_vectorizer.fit_transform(df['reviewText'])

# Dense copy with one named column per term, for easier inspection (optional)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_features.toarray(), columns=feature_names)

# Show the first rows of the feature table
print("TF-IDF Features:", tfidf_df.head(), sep="\n")

# Optional export, disabled by default:
# tfidf_df.to_csv('7.csv', index=False)
# print("TF-IDF features saved to CSV file.")


****************************************  8  ************************************

In [None]:
# Function to categorize ratings into three classes
def categorize_rating(rating):
    """Map a numeric rating to one of three classes.

    Returns:
        ``'Good'`` when the rating is above 3, ``'Average'`` when it equals
        3, and ``'Bad'`` for everything else.
    """
    if rating == 3:
        return 'Average'
    return 'Good' if rating > 3 else 'Bad'

# Classify every rating and store the labels in a new 'Rating_Class' column
rating_classes = df['overall'].apply(categorize_rating)
df['Rating_Class'] = rating_classes

# Preview the first rows of the updated frame
print(df.head())

# Export the updated frame to CSV, without the index column
df.to_csv('8.csv', index=False)
print("DataFrame saved to CSV file with Rating_Class column.")




*************************************  9  ******************************

In [None]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned review text; the label is the rating class
X = df['reviewText']
y = df['Rating_Class']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Report the shapes of both partitions
for split_label, features, labels in (
    ("Training data shape:", X_train, y_train),
    ("Testing data shape:", X_test, y_test),
):
    print(split_label, features.shape, labels.shape)


**********************************  10   ***********************

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import threading

# X_train and X_test are not created in this cell; they must already exist in
# the session (the train/test split earlier in this file defines them).
# TF-IDF vectorizer whose vocabulary is capped by max_features
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed

# Fit the vocabulary on the training text only, then reuse it unchanged for
# the test text so both matrices share the same columns
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Classifiers to compare, keyed by the display name that appears in the log
# lines and in each report's CSV file name; all use default hyper-parameters
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "Logistic Regression": LogisticRegression(),
    "Multinomial Naive Bayes": MultinomialNB()
}

def train_and_evaluate_model(name, model):
    """Fit one classifier, score it on the test set and save the report.

    Reads the module-level ``X_train_tfidf``, ``y_train``, ``X_test_tfidf``
    and ``y_test``, and writes ``'<name>_classification_report.csv'``.

    Args:
        name: Display name of the model; also used in the CSV file name.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
    """
    print(f"Training and evaluating {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Without `labels`, classification_report pairs target_names with the
    # labels in sorted order (Average, Bad, Good), so the old call printed
    # the 'Average' scores under "Bad" and vice versa. Passing the same list
    # as both arguments pins each row to its real class.
    class_order = ["Bad", "Average", "Good"]
    report = classification_report(
        y_test,
        y_pred,
        labels=class_order,
        target_names=class_order,
        output_dict=True,
    )

    # One row per class or summary entry, one column per metric
    report_df = pd.DataFrame(report).transpose()

    csv_file_name = f'{name}_classification_report.csv'
    report_df.to_csv(csv_file_name)
    print(f"{name} report saved to {csv_file_name}")
    print("="*50)

# One worker thread per model
threads = [
    threading.Thread(target=train_and_evaluate_model, args=(name, model))
    for name, model in models.items()
]

for thread in threads:
    thread.start()

# Block until every model has been trained and its report written
for thread in threads:
    thread.join()


************************  11  *************************

In [None]:
import pandas as pd

# Reload the text-preprocessed reviews written by the text-cleaning step, so
# this section does not depend on columns added to `df` by earlier cells
df = pd.read_pickle('headphones_Clean_data_afterpreprocess.pickle')


In [None]:
from scipy.sparse import csr_matrix


# Reviewer x product table; several ratings for the same pair are averaged
pivot_df = pd.pivot_table(
    df,
    index='reviewerID',
    columns='asin',
    values='overall',
    aggfunc='mean',
)

# Pairs with no rating come out of the pivot as NaN; store them as 0 before
# converting to CSR. Only do this if 0 is an acceptable "no rating" marker.
dense_ratings = pivot_df.fillna(0).to_numpy()
sparse_matrix = csr_matrix(dense_ratings)



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from scipy.sparse import csr_matrix, coo_matrix

# Load the text-preprocessed reviews; `data` is the frame the K-fold
# collaborative-filtering evaluation further down is run on
filename = 'headphones_Clean_data_afterpreprocess.pickle'
data = pd.read_pickle(filename)

# Function to create user-item rating matrix
def create_user_item_matrix(data):
    """Build a sparse reviewer x product rating matrix.

    Row and column positions are the category codes of ``reviewerID`` and
    ``asin``. When those columns are already categorical, their existing
    categories are reused, so subsets of one categorised frame produce
    matrices with identical shape and index meaning. Unlike the previous
    version, the input frame is not modified.

    Several ratings for the same reviewer/product pair are averaged; the
    old COO-to-CSR conversion silently summed them.

    Args:
        data: DataFrame with ``reviewerID``, ``asin`` and ``overall``.

    Returns:
        A CSR matrix of float ratings; cells without a rating are empty.
    """
    users = data['reviewerID'].astype('category')
    items = data['asin'].astype('category')
    shape = (len(users.cat.categories), len(items.cat.categories))

    ratings = pd.DataFrame({
        'user': users.cat.codes.to_numpy(),
        'item': items.cat.codes.to_numpy(),
        'rating': data['overall'].to_numpy(dtype=np.float64),
    })
    # Code -1 marks a missing ID; such rows cannot be placed in the matrix
    ratings = ratings[(ratings['user'] >= 0) & (ratings['item'] >= 0)]
    ratings = ratings.groupby(['user', 'item'], as_index=False)['rating'].mean()

    user_item_matrix = coo_matrix(
        (
            ratings['rating'].to_numpy(),
            (ratings['user'].to_numpy(), ratings['item'].to_numpy()),
        ),
        shape=shape,
    )
    return user_item_matrix.tocsr()

# Function to normalize ratings using min-max scaling
def normalize_ratings(user_item_matrix):
    """Min-max scale a sparse rating matrix to the range [0, 1].

    The minimum and maximum are taken over the whole matrix, empty cells
    included, so a matrix with any empty cell has minimum 0 and is simply
    divided by its maximum (empty cells stay empty).

    Args:
        user_item_matrix: SciPy sparse matrix of ratings.

    Returns:
        A CSR matrix of floats with the same shape.
    """
    max_rating = user_item_matrix.max()
    min_rating = user_item_matrix.min()
    rating_span = max_rating - min_rating

    if rating_span == 0:
        # Constant (or all-empty) matrix: avoid 0/0 and return all zeros
        return csr_matrix(user_item_matrix.shape, dtype=np.float64)

    if min_rating == 0:
        # Usual case: scaling keeps the matrix sparse
        return user_item_matrix / rating_span

    # Non-zero minimum: SciPy cannot subtract a non-zero scalar from a sparse
    # matrix (the old code raised here), so shift and scale a dense copy.
    return csr_matrix((user_item_matrix.toarray() - min_rating) / rating_span)

# Function to calculate cosine similarity for a batch
def calculate_similarity_batch(matrix_batch):
    """Return the pairwise cosine similarity between the rows of a batch."""
    batch_similarity = cosine_similarity(matrix_batch)
    return batch_similarity

# Function to find nearest neighbors
def find_nearest_neighbors(similarity_matrix, N):
    """Return, per row, the column indices of the N largest similarities.

    Indices are ordered from most to least similar. A row's own index is
    not excluded.
    """
    ascending = np.argsort(similarity_matrix, axis=1)
    descending = ascending[:, ::-1]
    return descending[:, :N]

# Function to predict missing values
def predict_missing_values(user_item_matrix, similarity_matrix, nearest_neighbors):
    """Predict each user's ratings as a similarity-weighted neighbour average.

    Args:
        user_item_matrix: Sparse users x items rating matrix.
        similarity_matrix: Dense users x users similarity matrix.
        nearest_neighbors: 2-D integer array; row ``u`` lists the neighbour
            indices used for user ``u``.

    Returns:
        A dense users x items array of predictions. Users whose neighbour
        similarities sum to zero keep an all-zero row; the previous code
        divided 0 by 0 there and returned NaN.
    """
    n_users, n_items = user_item_matrix.shape
    predicted_matrix = np.zeros((n_users, n_items))

    for user in range(n_users):
        nn_indices = nearest_neighbors[user]
        nn_similarities = similarity_matrix[user, nn_indices]

        weight_total = np.sum(nn_similarities)
        if weight_total == 0:
            # No usable neighbour signal for this user
            continue

        nn_ratings = user_item_matrix[nn_indices, :].toarray()
        # Weighted average of the neighbours' ratings, item by item
        predicted_matrix[user] = np.dot(nn_ratings.T, nn_similarities) / weight_total

    return predicted_matrix




# Function to calculate Mean Absolute Error (MAE)
def calculate_mae(actual_matrix, predicted_matrix):
    """Return the mean absolute error over cells that hold an actual rating.

    Cells where ``actual_matrix`` is 0 are treated as "no rating" and are
    left out of the average.
    """
    has_rating = actual_matrix != 0
    errors = np.abs(actual_matrix - predicted_matrix)
    return np.mean(errors[has_rating])

# Function to perform K-Folds validation with batch processing
def _pad_to_shape(matrix, shape):
    """Return a CSR copy of *matrix* enlarged to *shape*; added cells are empty."""
    padded = csr_matrix(matrix, copy=True)
    padded.resize(shape)
    return padded


def k_folds_validation(data, k, N, batch_size=1000, random_state=None):
    """Estimate the MAE of user-based collaborative filtering with K-fold CV.

    For every fold a user-item matrix is built from the training rows,
    ratings are predicted from each user's N most similar users, and the
    predictions are compared with the held-out ratings. Similarities are
    computed batch by batch, so a user's neighbours are searched only among
    the users of the same batch.

    Fixes over the previous version:
      * reviewer and product codes are assigned once on the full data, so
        training and validation matrices share shape and index meaning
        (per-fold codes made them incomparable);
      * the prediction buffer is float (it inherited the integer rating
        dtype and truncated every prediction);
      * predictions are mapped back to the original rating scale before
        being compared with the raw validation ratings.

    Args:
        data: DataFrame with ``reviewerID``, ``asin`` and ``overall``.
        k: Number of folds.
        N: Number of nearest neighbours used per user.
        batch_size: Number of users per similarity batch.
        random_state: Optional seed for the fold shuffle.

    Returns:
        The MAE averaged over the k folds.
    """
    # Work on a copy with shared category codes; subsets taken with iloc keep
    # the full category lists, so their codes stay globally consistent.
    data = data.copy()
    data['reviewerID'] = data['reviewerID'].astype('category')
    data['asin'] = data['asin'].astype('category')
    full_shape = (
        len(data['reviewerID'].cat.categories),
        len(data['asin'].cat.categories),
    )

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    mae_values = []

    for train_index, val_index in kf.split(data):
        train_data = data.iloc[train_index].copy()
        val_data = data.iloc[val_index].copy()

        # Pad to the full shape in case the highest codes are absent here
        user_item_matrix = _pad_to_shape(
            create_user_item_matrix(train_data), full_shape
        )

        # Remember the scaling so predictions can be mapped back afterwards
        rating_min = user_item_matrix.min()
        rating_span = user_item_matrix.max() - rating_min
        normalized_matrix = normalize_ratings(user_item_matrix)

        n_users = user_item_matrix.shape[0]
        predicted_matrix = np.zeros(user_item_matrix.shape, dtype=np.float64)

        for start_idx in range(0, n_users, batch_size):
            end_idx = min(start_idx + batch_size, n_users)

            batch_matrix = normalized_matrix[start_idx:end_idx]
            batch_similarity = calculate_similarity_batch(batch_matrix)

            # Neighbour indices are relative to the current batch
            batch_nearest_neighbors = find_nearest_neighbors(batch_similarity, N)

            batch_predicted_matrix = predict_missing_values(
                batch_matrix, batch_similarity, batch_nearest_neighbors
            )

            # Users with no training ratings have zero similarity to everyone,
            # which can surface as 0/0; treat that as "no prediction" (0)
            # instead of letting NaN poison the fold's MAE.
            predicted_matrix[start_idx:end_idx] = np.nan_to_num(
                batch_predicted_matrix, nan=0.0, posinf=0.0, neginf=0.0
            )

        # Undo the min-max scaling so both sides use the original scale
        if rating_span != 0:
            predicted_matrix = predicted_matrix * rating_span + rating_min

        validation_user_item_matrix = _pad_to_shape(
            create_user_item_matrix(val_data), full_shape
        )

        mae = calculate_mae(validation_user_item_matrix.toarray(), predicted_matrix)
        mae_values.append(mae)

    mean_mae = np.mean(mae_values)
    return mean_mae

# Function to plot MAE against K
def plot_mae(mae_values, k_values):
    """Plot MAE (y axis) against the neighbour counts (x axis) and show it.

    Args:
        mae_values: One MAE per entry of ``k_values``.
        k_values: Neighbour counts the MAEs were measured for.
    """
    import matplotlib.pyplot as plt

    # Draw on the current axes, exactly as the pyplot shortcuts would
    axes = plt.gca()
    axes.plot(k_values, mae_values)
    axes.set_xlabel('Number of Neighbors (K)')
    axes.set_ylabel('Mean Absolute Error (MAE)')
    axes.set_title('MAE vs K')
    plt.show()

# Experiment settings
N_values = [10, 20, 30, 40, 50]  # Neighbour counts to compare
K = 5  # Number of folds for K-Folds validation

# One cross-validated MAE per neighbour count, in the order of N_values
mae_values = []
for N in N_values:
    print(f"Calculating MAE for N = {N}...")
    mae_values.append(k_folds_validation(data, K, N))

plot_mae(mae_values, N_values)


##################  12   #####################

In [None]:
import pandas as pd

# Text-preprocessed reviews
df = pd.read_pickle('headphones_Clean_data_afterpreprocess.pickle')

# Total of all 'overall' ratings per product (asin)
product_sum_ratings = df.groupby('asin')['overall'].sum()

# Ten products with the largest rating totals, highest first
top_10_products = product_sum_ratings.sort_values(ascending=False).head(10)

# Report header
print("**Top 10 Products by User Sum Ratings Report**\n")
print("Product ASIN\t\tSum of Ratings")
print("="*35)

# One tab-separated line per product
for product_asin in top_10_products.index:
    print(f"{product_asin}\t{top_10_products.loc[product_asin]}")
