In [15]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
# Read the movie-review CSV into a DataFrame; the rest of the notebook
# works off this `data` frame.
data = pd.read_csv(filepath_or_buffer='movie.csv')

In [22]:
# Display the column labels of `data`.
# NOTE(review): the recorded output already lists 'cleaned_text', which the
# preprocessing cell further down adds — presumably this cell was executed
# after that one (out of order); confirm with Restart Kernel -> Run All.
data.columns

Index(['text', 'label', 'cleaned_text'], dtype='object')

### Text Preprocessing

In [16]:
def preprocess_text(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase the text, delete every character that is not
    ``a``-``z`` or whitespace (punctuation, digits and non-ASCII letters are
    dropped, not replaced by a space), collapse whitespace runs to one space
    and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str or scalar
        The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) give an
        empty string; any other non-string value is converted with ``str()``
        before cleaning.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    if not isinstance(text, str):
        # A CSV column read by pandas can hold NaN for empty cells; calling
        # .lower() on those would raise AttributeError in the middle of .apply.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and numbers (anything that is not a-z or whitespace)
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [19]:
# Run every raw review through preprocess_text and store the result in a
# new 'cleaned_text' column alongside the original 'text' column.
data['cleaned_text'] = data['text'].map(preprocess_text)

In [21]:
# Encode the labels to numerical values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

# Split the cleaned text (not the vectorized features) into training and
# testing sets. Splitting BEFORE vectorizing keeps the test reviews out of the
# TF-IDF vocabulary and IDF weights, so evaluation is not contaminated by
# test-set statistics (data leakage).
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_text'], y, test_size=0.2, random_state=42
)

# Convert text data into numerical data using TF-IDF.
# The vectorizer is fitted on the training text only; the test text is
# transformed with the already-fitted vocabulary/IDF.
# No full-corpus feature matrix is built: fitting on all rows is exactly what
# would leak test information into the features.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

# Now X_train, X_test, y_train, and y_test are ready for model training and evaluation
print("Preprocessing complete.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Preprocessing complete.
Shape of X_train: (32000, 5000)
Shape of X_test: (8000, 5000)
Shape of y_train: (32000,)
Shape of y_test: (8000,)


In [23]:
# import pickle  # only needed if the disabled save cell below is re-enabled

In [24]:
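# NOTE: this cell is disabled (every statement is commented out), so running it
# saves nothing; the "data saved" output shown under it is presumably left over
# from an earlier run when the code was active.
# # Save the processed data using pickle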
# with open('X_train.pkl', 'wb') as f:
#     pickle.dump(X_train, f)

# with open('X_test.pkl', 'wb') as f:
#     pickle.dump(X_test, f)

# with open('y_train.pkl', 'wb') as f:
#     pickle.dump(y_train, f)

# with open('y_test.pkl', 'wb') as f:
#     pickle.dump(y_test, f)

# print("Preprocessing complete and data saved.")

Preprocessing complete and data saved.
