In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Load the dataset
df = pd.read_csv(r'C:\Users\annsu\OneDrive\Desktop\AIDI\AIDI Semester2\AI in Enterprise Systems\movie.csv')

In [5]:
# Display the first few rows of the dataset to verify the structure
print(df.head())

                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1


In [6]:
# Check for NaN values in the dataset
print("Initial NaN values:\n", df.isnull().sum())

Initial NaN values:
 text     0
label    0
dtype: int64


In [7]:
# Drop rows with NaN values
df.dropna(inplace=True)

In [8]:
# Confirm no NaN values
print("After dropping NaN values:\n", df.isnull().sum())

After dropping NaN values:
 text     0
label    0
dtype: int64


In [9]:
# Ensure the dataset has the correct columns
print("Columns in the dataset:", df.columns)

Columns in the dataset: Index(['text', 'label'], dtype='object')


In [10]:
# Map the 'label' values to numerical values (assuming 'positive' and 'negative')
df['label'] = df['label'].map({'positive': 1, 'negative': 0})

In [11]:
# Check for NaN values in the 'label' column after mapping
print("NaN values in 'label' after mapping:\n", df['label'].isnull().sum())

# Drop any rows where 'label' is NaN after mapping
df = df.dropna(subset=['label'])

# Confirm no NaN values in the 'label' column
print("NaN values in 'label' after dropping NaNs:\n", df['label'].isnull().sum())

NaN values in 'label' after mapping:
 40000
NaN values in 'label' after dropping NaNs:
 0


In [12]:
# Verify the number of samples in the dataset
print("Number of samples in the dataset:", len(df))

Number of samples in the dataset: 0


In [13]:
# Define a function to print evaluation metrics
def print_evaluation_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

In [14]:
# Proceed with splitting the data if there are enough samples
if len(df) > 0:
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
    
    # Vectorize the text data
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    # Train Logistic Regression model
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train_tfidf, y_train)
    
    # Evaluate the model
    y_pred_lr = lr_model.predict(X_test_tfidf)
    
    # Evaluate Logistic Regression
    print("Logistic Regression:")
    print_evaluation_metrics(y_test, y_pred_lr)
    
    # Train Support Vector Machine model
    svm_model = SVC()
    svm_model.fit(X_train_tfidf, y_train)
    
    # Evaluate the model
    y_pred_svm = svm_model.predict(X_test_tfidf)
    
    # Evaluate SVM
    print("Support Vector Machine:")
    print_evaluation_metrics(y_test, y_pred_svm)
    
    # Train Random Forest model
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train_tfidf, y_train)
     # Evaluate the model
    y_pred_rf = rf_model.predict(X_test_tfidf)
    
    # Evaluate Random Forest
    print("Random Forest:")
    print_evaluation_metrics(y_test, y_pred_rf)

else:
    print("The dataset is empty after preprocessing. Please check the dataset and preprocessing steps.")

The dataset is empty after preprocessing. Please check the dataset and preprocessing steps.
