In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [5]:

# Load Data
train_df = pd.read_csv('train.csv')
# Separate train and test data within train.csv
train_data  = train_df[train_df['Score'].notna()]  # Rows with known Score values for training
test_data = train_df[train_df['Score'].isna()]    # Rows with missing Score values for testing

# Verify the number of rows in each split
print(f"Number of rows in train_data (with Score): {len(train_data)}")
print(f"Number of rows in test_data (without Score): {len(test_data)}")


Number of rows in train_data (with Score): 1485341
Number of rows in test_data (without Score): 212192


In [None]:
!pip install nltk

import nltk
nltk.download('vader_lexicon')




[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [25]:
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from multiprocessing import Pool
import time
import nltk

# Ensure VADER lexicon is downloaded
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def calculate_vader_sentiment(text):
    # Return the compound sentiment score, a standardized measure of positivity/negativity
    return analyzer.polarity_scores(text)['compound']

# Function to apply sentiment analysis to a chunk of data
def apply_sentiment(chunk):
    return chunk.apply(calculate_vader_sentiment)

# Multiprocessing function for parallel sentiment analysis
def parallelize_sentiment_analysis(data, num_partitions=10):
    # Split the data into chunks for each process
    data_split = np.array_split(data, num_partitions)
    pool = Pool()  # Pool will use as many cores as available by default

    # Map the sentiment analysis function across all partitions
    sentiment_scores = np.concatenate(pool.map(apply_sentiment, data_split))

    pool.close()
    pool.join()

    return sentiment_scores

# Perform sentiment analysis on train and test data, and track time
start_time = time.time()

# Sentiment analysis for 'Text' column in train_df
train_data['Sentiment'] = parallelize_sentiment_analysis(train_data['Text'].fillna(''))
print(f"Sentiment analysis on train data complete. Time elapsed: {time.time() - start_time:.2f} seconds")

# Sentiment analysis for 'Text' column in test_df
test_start_time = time.time()
test_data['Sentiment'] = parallelize_sentiment_analysis(test_data['Text'].fillna(''))
print(f"Sentiment analysis on test data complete. Time elapsed: {time.time() - test_start_time:.2f} seconds")

# Total time for both datasets
end_time = time.time()
print(f"Total Time for Sentiment Analysis on Train and Test Data: {end_time - start_time:.2f} seconds")

# Check the resulting sentiment scores
print(train_data[['Text', 'Sentiment']].head())
print(test_data[['Text', 'Sentiment']].head())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  return bound(*args, **kwds)


Sentiment analysis on train data complete. Time elapsed: 1197.42 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Sentiment'] = parallelize_sentiment_analysis(test_data['Text'].fillna(''))


Sentiment analysis on test data complete. Time elapsed: 526.92 seconds
Total Time for Sentiment Analysis on Train and Test Data: 1724.34 seconds
                                                Text  Sentiment
0  I'm giving the DVD itself 4 stars, the movie i...     0.9501
1  With all that this movie could have been this ...    -0.6858
2  No disrespect to Ms. Jennifer Hudson, but this...    -0.9543
3  What Women Want isn't a bad film. It's just to...     0.5650
4  I loved Enchanted alot.I didn't see it in Thea...     0.9779
                                                      Text  Sentiment
1485341  This movie was just alright for me. I think pe...     0.9862
1485342  My kids love this movie.  Exciting and fun to ...     0.9389
1485343  The effects were good, not the best but far fr...    -0.4912
1485344  I just got a copy of this S. Korean dvd of "He...     0.9712
1485345  OK, to put this clearly and bluntly... if you ...     0.8141


In [None]:
# Convert Unix timestamp to datetime format for both train and test data
train_data['Time'] = pd.to_datetime(train_data['Time'], unit='s')
test_data['Time'] = pd.to_datetime(test_data['Time'], unit='s')

# Extract year, month, and season from the 'Time' column
train_data['Year'] = train_data['Time'].dt.year
train_data['Month'] = train_data['Time'].dt.month
train_data['Season'] = train_data['Month'] % 12 // 3  # 0=Winter, 1=Spring, 2=Summer, 3=Fall

test_data['Year'] = test_data['Time'].dt.year
test_data['Month'] = test_data['Time'].dt.month
test_data['Season'] = test_data['Month'] % 12 // 3

# Calculate Helpfulness Ratio and fill NaN values with zero
train_data['HelpfulnessRatio'] = train_data['HelpfulnessNumerator'] / (train_data['HelpfulnessDenominator'] + 1)
test_data['HelpfulnessRatio'] = test_data['HelpfulnessNumerator'] / (test_data['HelpfulnessDenominator'] + 1)

train_data['HelpfulnessRatio'].fillna(0, inplace=True)
test_data['HelpfulnessRatio'].fillna(0, inplace=True)

# Select relevant features for KNN
train_features = train_data[['Sentiment', 'Year', 'Month', 'Season', 'HelpfulnessRatio']]
test_features = test_data[['Sentiment', 'Year', 'Month', 'Season', 'HelpfulnessRatio']]

# Target variable
y = train_df['Score']

# Data Sampling for Cross-Validation
sampled_train_features = train_features.sample(n=200000, random_state=42)
sampled_y = y[sampled_train_features.index]

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(sampled_train_features)
X_test_scaled = scaler.transform(test_features)

# Optional: Dimensionality Reduction if needed
pca = PCA(n_components=3, random_state=42)
X_reduced = pca.fit_transform(X_scaled)
X_test_reduced = pca.transform(X_test_scaled)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_reduced, sampled_y, test_size=0.2, random_state=42)

# Set up KNN with cross-validation for 'n_neighbors' tuning
param_grid = {'n_neighbors': [31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63]}
knn = KNeighborsClassifier()

# Grid search with 3-fold cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters and score from cross-validation
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)


In [13]:

# Train final model with best parameters on full training set
best_knn = grid_search.best_estimator_
best_knn.fit(X_train, y_train)

# Validate model on validation set
y_val_pred = best_knn.predict(X_val)
print("Validation Report:\n", classification_report(y_val, y_val_pred))

# Predict on Test Set
test_predictions = best_knn.predict(X_test_reduced)

# Prepare submission
submission = pd.DataFrame({'Id': test_data['Id'], 'Score': test_predictions})
submission.to_csv('submission.csv', index=False)
print("Submission file created!")

Validation Report:
               precision    recall  f1-score   support

         1.0       0.22      0.14      0.17      2455
         2.0       0.15      0.01      0.02      2335
         3.0       0.19      0.01      0.02      4713
         4.0       0.22      0.02      0.03      9045
         5.0       0.55      0.96      0.70     21452

    accuracy                           0.53     40000
   macro avg       0.27      0.23      0.19     40000
weighted avg       0.39      0.53      0.40     40000

Submission file created!
