# Data Pre- Processing for NLP

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [4]:
# Importing the saved data from EDA

data = pd.read_csv('EDA_reviews_amazon.csv')

In [9]:
data.head()

Unnamed: 0,Rating,Rating_Sentiment,Review_Title,Review,Review_str_len,Title_str_len,title_word_tokenize,Review_word_tokenize,title_wtoken_cnt,Review_wtoken_cnt,...,works,worth,written,year,years,likert_scale,lexical_diversity,PCA1,PCA2,PCA3
0,3,Neutral,more like funchuck,gave this to my dad for a gag gift after direc...,93,18,"['more', 'like', 'funchuck']","['gave', 'this', 'to', 'my', 'dad', 'for', 'a'...",3,20,...,0.0,0.0,0.0,0.0,0.0,Average,0.16,-0.10098,-0.045847,0.022069
1,5,Positive,inspiring,i hope a lot of people hear this cd we need m...,204,9,['inspiring'],"['i', 'hope', 'a', 'lot', 'of', 'people', 'hea...",1,38,...,0.0,0.0,0.0,0.0,0.0,Good,0.086538,-0.085753,0.003868,-0.122467
2,5,Positive,the best soundtrack ever to anything,i m reading a lot of reviews saying that this ...,470,37,"['the', 'best', 'soundtrack', 'ever', 'to', 'a...","['i', 'm', 'reading', 'a', 'lot', 'of', 'revie...",6,96,...,0.0,0.259375,0.0,0.0,0.248071,Good,0.035904,-0.113531,-0.050741,-0.071864
3,5,Positive,too good to be true,probably the greatest soundtrack in history u...,377,19,"['too', 'good', 'to', 'be', 'true']","['probably', 'the', 'greatest', 'soundtrack', ...",5,67,...,0.0,0.383432,0.0,0.0,0.0,Good,0.051327,-0.127917,-0.052078,0.012366
4,5,Positive,there s a reason for the price,there s a reason this cd is so expensive even...,193,30,"['there', 's', 'a', 'reason', 'for', 'the', 'p...","['there', 's', 'a', 'reason', 'this', 'cd', 'i...",7,41,...,0.0,0.0,0.0,0.0,0.0,Good,0.083067,-0.123744,5e-06,-0.144462


In [14]:
# Step1 defining features and target

# Define the target variable (y) and features (X)
# The `Rating_Sentiment` column appears to be the target variable based on the data structure.
# Features will be all numerical columns from 'Review_str_len' to the end.
#X = data.loc[:, 'Review_str_len':'years']
# Drop non-numeric columns except target/label
X = data.drop(['Review_Title', 'title_word_tokenize','Review_word_tokenize','Review', 'likert_scale', 'Cleaned_Review', 'review_punc_stop_words_removed', 'porterStemmer', 'WordNet_Lemmatizer', 'Rating_Sentiment'], axis=1)
y = data['Rating_Sentiment']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (1800000, 110)
Target shape: (1800000,)


In [15]:
X.head()

Unnamed: 0,Rating,Review_str_len,Title_str_len,title_wtoken_cnt,Review_wtoken_cnt,review_removed_cnt,actually,album,author,bad,...,work,works,worth,written,year,years,lexical_diversity,PCA1,PCA2,PCA3
0,3,93,18,3,20,9,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.16,-0.10098,-0.045847,0.022069
1,5,204,9,1,38,22,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.086538,-0.085753,0.003868,-0.122467
2,5,470,37,6,96,34,0.0,0.0,0.0,0.0,...,0.0,0.0,0.259375,0.0,0.0,0.248071,0.035904,-0.113531,-0.050741,-0.071864
3,5,377,19,5,67,28,0.0,0.0,0.0,0.0,...,0.0,0.0,0.383432,0.0,0.0,0.0,0.051327,-0.127917,-0.052078,0.012366
4,5,193,30,7,41,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083067,-0.123744,5e-06,-0.144462


In [16]:
# Step 2: Split the data into training and validation sets

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Validation target shape: {y_val.shape}")

Training features shape: (1440000, 110)
Validation features shape: (360000, 110)
Training target shape: (1440000,)
Validation target shape: (360000,)


In [17]:
# Step 3: Normalize and standardize the numerical features

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and validation data
# The scaler learns the mean and standard deviation from the training data only
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Convert the scaled data back to a DataFrame for easier inspection
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X.columns)

print("Scaled training features (first 5 rows):")
print(X_train_scaled_df.head())

print("\nScaled validation features (first 5 rows):")
print(X_val_scaled_df.head())

Scaled training features (first 5 rows):
     Rating  Review_str_len  Title_str_len  title_wtoken_cnt  \
0  1.224796       -0.037667       0.092816         -0.160130   
1 -1.225064        0.345768      -0.685000         -0.533372   
2 -1.225064       -0.459446      -1.392106         -1.279855   
3 -0.000134        0.648256       1.224186          1.332836   
4 -1.225064       -0.796017      -0.755711         -0.906613   

   Review_wtoken_cnt  review_removed_cnt  actually     album    author  \
0          -0.112532            0.092511 -0.179722 -0.215662 -0.188686   
1           0.301066            0.516678  3.735406 -0.215662 -0.188686   
2          -0.365287           -0.808842 -0.179722 -0.215662 -0.188686   
3           0.415954            0.728761 -0.179722 -0.215662  7.363296   
4          -0.916751           -0.808842 -0.179722 -0.215662 -0.188686   

        bad  ...      work     works     worth   written      year     years  \
0 -0.232802  ... -0.274708 -0.193677 -0.211024 -0