In [137]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
import os

# Locations of the raw CSV inputs and of their pickle caches
train_csv_path = 'train.csv'
test_csv_path = 'test.csv'
train_pickle_path = 'train.pkl'
test_pickle_path = 'test.pkl'

# The fast path is taken only when BOTH caches are present
pickles_available = all(
    os.path.exists(cache_path)
    for cache_path in (train_pickle_path, test_pickle_path)
)

if not pickles_available:
    # Slow path: parse the CSV files ...
    train_df = pd.read_csv(train_csv_path)
    test_df = pd.read_csv(test_csv_path)
    print("DataFrames loaded from CSV files.")

    # ... and cache the parsed frames so the next run can skip the parsing
    with open(train_pickle_path, 'wb') as train_cache:
        pickle.dump(train_df, train_cache)
    with open(test_pickle_path, 'wb') as test_cache:
        pickle.dump(test_df, test_cache)
    print("DataFrames saved to pickle files for future use.")
else:
    # Fast path: restore the frames exactly as they were pickled.
    # NOTE: pickle.load can execute arbitrary code -- only load caches
    # that were written by this notebook.
    with open(train_pickle_path, 'rb') as train_cache:
        train_df = pickle.load(train_cache)
    with open(test_pickle_path, 'rb') as test_cache:
        test_df = pickle.load(test_cache)
    print("DataFrames loaded from pickle files.")

# Preview the training frame
print("First few rows of train_df:")
print(train_df.head())

# Preview the test frame
print("\nFirst few rows of test_df:")
print(test_df.head())


DataFrames loaded from pickle files.
First few rows of train_df:
        Id   ProductId          UserId  HelpfulnessNumerator  \
0   914403  B0009W5KHM   AV6QDP8Q0ONK4                     2   
1   354887  6303079709  A2I8RXJN80A2D2                     0   
2  1407653  B004H0M2XC  A3FHV3RV8Z12E6                     0   
3  1377458  B003ZJ9536  A12VLTA3ZHVPUY                     1   
4   475323  630574453X  A13NM1PES9OXVN                     2   

   HelpfulnessDenominator        Time  \
0                       2  1341014400   
1                       0  1168819200   
2                       0  1386201600   
3                       1  1348704000   
4                       3   970012800   

                                         Summary  \
0                                  GOOD FUN FILM   
1                                   Movie Review   
2             When is it a good time to Consent?   
3                                          TRUTH   
4  Intelligent and bittersweet -- stays wit

In [138]:
# Exploratory checks on the raw frames, followed by removal of the rows that
# have no 'Score' label (those rows cannot be used for supervised training).

# Check for missing values in train_df
print("Missing values in train_df:")
print(train_df.isnull().sum())

# Check for missing values in test_df
print("\nMissing values in test_df:")
print(test_df.isnull().sum())

# Examine data types in train_df
print("\nData types in train_df:")
print(train_df.dtypes)

# Examine data types in test_df
print("\nData types in test_df:")
print(test_df.dtypes)

# Check for fully duplicated rows in train_df
duplicate_rows = train_df[train_df.duplicated()]
print(f"\nNumber of duplicate rows in train_df: {duplicate_rows.shape[0]}")

# Statistical summary of numerical features
print("\nStatistical summary of numerical features in train_df:")
print(train_df.describe())

# Distribution of the target, including the unlabeled (NaN) rows
print("\nValue counts for 'Score' in train_df:")
print(train_df['Score'].value_counts(dropna=False))

# Separate rows with missing 'Score' in train_df
missing_score_df = train_df[train_df['Score'].isnull()]
print(f"\nNumber of rows with missing 'Score' in train_df: {missing_score_df.shape[0]}")

# Keep only rows with non-missing 'Score' for training.
# .copy() gives the filtered frame its own data, so the column assignment
# below is an ordinary write instead of a write to a slice of the original
# frame (which raises SettingWithCopyWarning and has ambiguous semantics).
train_df = train_df[train_df['Score'].notnull()].copy()

# Convert 'Score' to integer type (safe now that no NaN labels remain)
train_df['Score'] = train_df['Score'].astype(int)
print("\nAfter removing missing 'Score', the shape of train_df:")
print(train_df.shape)


Missing values in train_df:
Id                             0
ProductId                      0
UserId                         0
HelpfulnessNumerator           0
HelpfulnessDenominator         0
Time                           0
Summary                       32
Text                          62
Score                     212192
dtype: int64

Missing values in test_df:
Id            0
Score    212192
dtype: int64

Data types in train_df:
Id                          int64
ProductId                  object
UserId                     object
HelpfulnessNumerator        int64
HelpfulnessDenominator      int64
Time                        int64
Summary                    object
Text                       object
Score                     float64
dtype: object

Data types in test_df:
Id         int64
Score    float64
dtype: object

Number of duplicate rows in train_df: 0

Statistical summary of numerical features in train_df:
                 Id  HelpfulnessNumerator  HelpfulnessDenominator  \
count 

In [139]:
# Cleans the labeled training rows and derives the time, helpfulness and
# combined-text columns used by the later cells.

# Number of missing 'Text' values after removing missing 'Score' rows
missing_text_count = train_df['Text'].isnull().sum()
print(f"Number of missing 'Text' values in train_df: {missing_text_count}")

# Remove rows with missing 'Text'.
# .copy() gives the filtered frame its own data, so every column assignment
# below is an ordinary write instead of a write to a slice of the previous
# frame (which raises SettingWithCopyWarning and has ambiguous semantics).
train_df = train_df[train_df['Text'].notnull()].copy()
print(f"After removing missing 'Text' rows, the shape of train_df: {train_df.shape}")

# Fill missing 'Summary' values with empty string
train_df['Summary'] = train_df['Summary'].fillna('')

# Verify that there are no missing 'Summary' values
missing_summary_count = train_df['Summary'].isnull().sum()
print(f"Number of missing 'Summary' values in train_df after imputation: {missing_summary_count}")

# Convert 'Time' from UNIX timestamp (seconds) to datetime
train_df['ReviewTime'] = pd.to_datetime(train_df['Time'], unit='s')

# Display the first few 'ReviewTime' entries
print("\nFirst few 'ReviewTime' entries:")
print(train_df['ReviewTime'].head())

# Extract year, month, day of week
train_df['ReviewYear'] = train_df['ReviewTime'].dt.year
train_df['ReviewMonth'] = train_df['ReviewTime'].dt.month
train_df['ReviewDayOfWeek'] = train_df['ReviewTime'].dt.dayofweek

# Calculate the age of the review in days.
# NOTE: 'now' is evaluated when the cell runs, so 'ReviewAgeDays' changes
# from one execution to the next.
current_time = pd.to_datetime('now')
train_df['ReviewAgeDays'] = (current_time - train_df['ReviewTime']).dt.days

# Display the first few rows of new time features
print("\nFirst few rows of time-based features:")
print(train_df[['ReviewYear', 'ReviewMonth', 'ReviewDayOfWeek', 'ReviewAgeDays']].head())

# Avoid division by zero: a zero denominator becomes NaN.
# Note that this overwrites the stored 'HelpfulnessDenominator' column.
train_df['HelpfulnessDenominator'] = train_df['HelpfulnessDenominator'].replace(0, np.nan)

# Calculate HelpfulnessRatio
train_df['HelpfulnessRatio'] = train_df['HelpfulnessNumerator'] / train_df['HelpfulnessDenominator']

# Fill NaN values with 0 (assuming no helpfulness feedback)
train_df['HelpfulnessRatio'] = train_df['HelpfulnessRatio'].fillna(0.0)

# Display the first few rows of 'HelpfulnessRatio'
print("\nFirst few 'HelpfulnessRatio' values:")
print(train_df['HelpfulnessRatio'].head())

# Combine 'Summary' and 'Text' into 'ReviewText'
train_df['ReviewText'] = train_df['Summary'] + ' ' + train_df['Text']

# Display the first few 'ReviewText' entries
print("\nFirst few 'ReviewText' entries:")
print(train_df['ReviewText'].head())


Number of missing 'Text' values in train_df: 54
After removing missing 'Text' rows, the shape of train_df: (1485287, 9)
Number of missing 'Summary' values in train_df after imputation: 0

First few 'ReviewTime' entries:
0   2012-06-30
1   2007-01-15
2   2013-12-05
3   2012-09-27
4   2000-09-27
Name: ReviewTime, dtype: datetime64[ns]

First few rows of time-based features:
   ReviewYear  ReviewMonth  ReviewDayOfWeek  ReviewAgeDays
0        2012            6                5           4502
1        2007            1                0           6495
2        2013           12                3           3979
3        2012            9                3           4413
4        2000            9                2           8796

First few 'HelpfulnessRatio' values:
0    1.000000
1    0.000000
2    0.000000
3    1.000000
4    0.666667
Name: HelpfulnessRatio, dtype: float64

First few 'ReviewText' entries:
0    GOOD FUN FILM While most straight to DVD films...
1    Movie Review I have wanted this

In [140]:
# Text processing libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')
# Build the English stemmer and the stop-word lookup set used by preprocess_text
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a raw review string into a space-separated string of stems.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lower-cased and split on whitespace, tokens found in the module-level
    ``stop_words`` set are discarded, and each remaining token is passed
    through the module-level ``stemmer``.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving stems joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    stems = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)
# Reuse the cached cleaned text when it exists; otherwise compute and cache it
preprocessed_text_pickle = 'preprocessed_text.pkl'

if not os.path.exists(preprocessed_text_pickle):
    # Run the cleaning pass over every combined review
    train_df['CleanReviewText'] = train_df['ReviewText'].apply(preprocess_text)
    print("Text preprocessing completed.")

    # Persist the cleaned column so later runs can skip the cleaning pass
    with open(preprocessed_text_pickle, 'wb') as text_cache:
        pickle.dump(train_df['CleanReviewText'], text_cache)
    print("Preprocessed text saved to pickle file.")
else:
    # NOTE: pickle.load can execute arbitrary code -- only load caches
    # that were written by this notebook.
    with open(preprocessed_text_pickle, 'rb') as text_cache:
        train_df['CleanReviewText'] = pickle.load(text_cache)
    print("Loaded preprocessed text from pickle file.")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pengrui_f/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded preprocessed text from pickle file.


In [141]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build (or restore) the TF-IDF representation of 'CleanReviewText'.
#
# The FITTED vectorizer is cached next to the feature matrix. Caching only
# the matrix would leave `tfidf_vectorizer` unfitted on a cached run, and the
# later `tfidf_vectorizer.transform(...)` on the test text would then fail.
tfidf_features_pickle = 'tfidf_features.pkl'
tfidf_vectorizer_pickle = 'tfidf_vectorizer.pkl'

# Take the fast path only when BOTH artifacts exist; a features-only cache
# (written before the vectorizer was cached) falls through and is rebuilt.
if os.path.exists(tfidf_features_pickle) and os.path.exists(tfidf_vectorizer_pickle):
    # NOTE: pickle.load can execute arbitrary code -- only load caches
    # that were written by this notebook.
    with open(tfidf_vectorizer_pickle, 'rb') as f:
        tfidf_vectorizer = pickle.load(f)
    with open(tfidf_features_pickle, 'rb') as f:
        X_text_tfidf = pickle.load(f)
    print("Loaded TF-IDF features from pickle file.")
else:
    # Initialize TF-IDF Vectorizer (vocabulary capped via max_features)
    tfidf_vectorizer = TfidfVectorizer(max_features=11000)

    # Fit and transform the 'CleanReviewText'
    X_text_tfidf = tfidf_vectorizer.fit_transform(train_df['CleanReviewText'])
    print("TF-IDF vectorization completed.")

    # Save the fitted vectorizer and the TF-IDF features to pickle files
    with open(tfidf_vectorizer_pickle, 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    with open(tfidf_features_pickle, 'wb') as f:
        pickle.dump(X_text_tfidf, f)
    print("TF-IDF features saved to pickle file.")


TF-IDF vectorization completed.
TF-IDF features saved to pickle file.


In [142]:
# Token count of each cleaned review
train_df['ReviewLength'] = train_df['CleanReviewText'].map(
    lambda cleaned: len(cleaned.split())
)

# Numerical columns fed to the model. Other engineered columns that exist on
# train_df but are currently left out: 'ReviewLength', 'ReviewMonth',
# 'ReviewDayOfWeek', 'ReviewAgeDays'.
numerical_features = ['HelpfulnessRatio', 'ReviewYear']

# Dense array holding just the selected columns
X_numerical = train_df[numerical_features].to_numpy()


In [143]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training numerics, then standardize them with it
scaler = StandardScaler().fit(X_numerical)
X_numerical_scaled = scaler.transform(X_numerical)

In [144]:
# Design matrix: TF-IDF columns followed by the scaled numerical columns
from scipy.sparse import hstack
feature_blocks = (X_text_tfidf, X_numerical_scaled)
X = hstack(feature_blocks)
# Target vector
y = train_df['Score']


In [145]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation, stratified on the label;
# the fixed random_state keeps the split reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts
print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")



Training set size: (1113965, 11002)
Validation set size: (371322, 11002)


In [146]:
from sklearn.linear_model import SGDClassifier

# Settings for a linear classifier trained with stochastic gradient descent
sgd_settings = dict(
    loss='log_loss',            # logistic-regression objective
    max_iter=500,
    tol=1e-3,
    n_jobs=-1,
    random_state=42,
    early_stopping=True,        # stop when the held-out score stalls ...
    validation_fraction=0.1,    # ... measured on 10% of the training rows
    n_iter_no_change=5,
    class_weight='balanced',    # handle class imbalance
)
model = SGDClassifier(**sgd_settings)

# Fit on the training split
model.fit(X_train, y_train)
print("Model training completed.")


Model training completed.


In [147]:
from sklearn.metrics import accuracy_score, classification_report

# Score the held-out validation rows
y_pred = model.predict(X_val)

# Overall accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1
report_text = classification_report(y_val, y_pred)
print("\nClassification Report:")
print(report_text)


Validation Accuracy: 0.6304

Classification Report:
              precision    recall  f1-score   support

           1       0.51      0.63      0.56     22797
           2       0.36      0.36      0.36     22418
           3       0.43      0.35      0.39     44019
           4       0.47      0.35      0.40     83804
           5       0.75      0.84      0.79    198284

    accuracy                           0.63    371322
   macro avg       0.50      0.51      0.50    371322
weighted avg       0.61      0.63      0.62    371322



In [148]:
# Persist the fitted classifier to disk
model_pickle = 'trained_model.pkl'
with open(model_pickle, 'wb') as model_file:
    pickle.dump(model, model_file)
print("Trained model saved to pickle file.")

Trained model saved to pickle file.


In [149]:
# Ensure necessary libraries are imported
import pandas as pd
import numpy as np
import pickle
import os
from scipy.sparse import hstack

# Relies on objects created by earlier cells: 'model', 'scaler',
# 'tfidf_vectorizer', 'preprocess_text', 'stop_words' and 'stemmer'.

# 1. Make sure the test Id list is in memory
test_csv_path = 'test.csv'
if 'test_df' not in globals():
    test_df = pd.read_csv(test_csv_path)
    print("Loaded 'test.csv' into test_df.")

# 2. Obtain the unfiltered train.csv contents (the feature columns for the
#    test Ids are looked up in it), using a pickle cache when available
train_full_pickle_path = 'train_full.pkl'
if not os.path.exists(train_full_pickle_path):
    train_full_df = pd.read_csv('train.csv')
    with open(train_full_pickle_path, 'wb') as full_cache:
        pickle.dump(train_full_df, full_cache)
    print("Full train data loaded from CSV and saved to pickle.")
else:
    # NOTE: pickle.load can execute arbitrary code -- only load caches
    # that were written by this notebook.
    with open(train_full_pickle_path, 'rb') as full_cache:
        train_full_df = pickle.load(full_cache)
    print("Full train data loaded from pickle file.")

# 3. Left-join on 'Id' so the rows follow the order of test_df
test_data = test_df[['Id']].merge(train_full_df, on='Id', how='left')
print(f"Test data shape after merge: {test_data.shape}")

# 4. Missing text fields become empty strings
for text_column in ('Text', 'Summary'):
    test_data[text_column] = test_data[text_column].fillna('')

# 5. Single free-text field: summary, a space, then the review body
test_data['ReviewText'] = test_data['Summary'] + ' ' + test_data['Text']

# 6. UNIX seconds -> datetime
test_data['ReviewTime'] = pd.to_datetime(test_data['Time'], unit='s')

# 7. Calendar year and age in days ('now' is evaluated when the cell runs)
test_data['ReviewYear'] = test_data['ReviewTime'].dt.year
current_time = pd.to_datetime('now')
test_data['ReviewAgeDays'] = (current_time - test_data['ReviewTime']).dt.days

# 8. Zero denominators become NaN so the division below cannot divide by zero
test_data['HelpfulnessDenominator'] = test_data['HelpfulnessDenominator'].replace(0, np.nan)

# 9. Helpfulness ratio; rows without a usable denominator get 0.0
helpfulness_ratio = test_data['HelpfulnessNumerator'] / test_data['HelpfulnessDenominator']
test_data['HelpfulnessRatio'] = helpfulness_ratio
test_data['HelpfulnessRatio'] = test_data['HelpfulnessRatio'].fillna(0.0)

# 10. Cleaned review text for the test rows, cached on disk
preprocessed_test_text_pickle = 'preprocessed_test_text.pkl'

if not os.path.exists(preprocessed_test_text_pickle):
    # Run the cleaning pass over 'ReviewText'
    test_data['CleanReviewText'] = test_data['ReviewText'].apply(preprocess_text)
    print("Test text preprocessing completed.")

    # Persist the cleaned column for later runs
    with open(preprocessed_test_text_pickle, 'wb') as test_text_cache:
        pickle.dump(test_data['CleanReviewText'], test_text_cache)
    print("Preprocessed test text saved to pickle file.")
else:
    with open(preprocessed_test_text_pickle, 'rb') as test_text_cache:
        test_data['CleanReviewText'] = pickle.load(test_text_cache)
    print("Loaded preprocessed test text from pickle file.")

Full train data loaded from pickle file.
Test data shape after merge: (212192, 9)
Loaded preprocessed test text from pickle file.


In [150]:
# 11. Vectorize the cleaned test text with the vectorizer used for training
clean_test_text = test_data['CleanReviewText']
X_test_text_tfidf = tfidf_vectorizer.transform(clean_test_text)

# 12. Same numerical columns as in training, scaled with the training scaler
numerical_features = ['HelpfulnessRatio', 'ReviewYear']
X_test_numerical = test_data[numerical_features].to_numpy()
X_test_numerical_scaled = scaler.transform(X_test_numerical)

# 13. Design matrix: TF-IDF columns followed by the scaled numerical columns
X_test = hstack((X_test_text_tfidf, X_test_numerical_scaled))

# 14. Predicted 'Score' for every test row
y_test_pred = model.predict(X_test)

# 15. Submission frame: a copy of test_df with 'Score' replaced by the
#     predictions (assigned positionally, so row order follows test_df)
submission_df = test_df.assign(Score=y_test_pred)

# 16. Write only the two submission columns, without the index
submission_columns = ['Id', 'Score']
submission_df[submission_columns].to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully.")


Submission file 'submission.csv' created successfully.
