<a href="https://colab.research.google.com/github/meenabm07/codesoft_02/blob/main/codesoft_02_MOVIE_RATING_PREDICTION_USING_PYTHON_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

. IMPORT LIBRARIES AND LOAD DATA

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the dataset with specified encoding
file_path = '/content/IMDb Movies India.csv'  # Update this with the actual path
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows of the dataset
print(df.head())

# Display basic information about the dataset
print(df.info())

# Check for missing values
print(df.isnull().sum())


                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

 Check for NaN Values in Predictions

In [None]:
print("NaNs in y_test:", y_test.isnull().sum())
print("NaNs in y_pred:", pd.isna(y_pred).sum())


NaNs in y_test: 1
NaNs in y_pred: 0


Investigate and Handle NaN Values

In [None]:
print("NaNs in X_train:", X_train.isnull().sum().sum())
print("NaNs in X_test:", X_test.isnull().sum().sum())
print("NaNs in y_train:", y_train.isnull().sum())
print("NaNs in y_test:", y_test.isnull().sum())

NaNs in X_train: 24814
NaNs in X_test: 6205
NaNs in y_train: 0
NaNs in y_test: 1


In [None]:
# Remove rows where the target variable 'rating' is NaN
df = df.dropna(subset=['Rating'])

# Re-define features and target variable after removing NaNs
X = df.drop(['Rating'], axis=1)
y = df['Rating']


DATA PREPROCESSING

In [None]:
# Convert numerical columns to appropriate data types
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

# Remove rows with NaN in target variable
df = df.dropna(subset=['Rating'])

# Handle missing values (if any)
df.fillna(method='ffill', inplace=True)


# Define features and target variable
X = df.drop(['Rating'], axis=1)
y = df['Rating']

# Ensure no NaN values in features or target
print("NaNs in X:", X.isnull().sum().sum())
print("NaNs in y:", y.isnull().sum())

# Categorical and numerical columns
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numerical_cols = ['Year', 'Duration', 'Votes']

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NaNs in X: 15838
NaNs in y: 0


MODEL TRAINING AND EVALUATION

In [None]:
# Define and train the model
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 9.12066869719769
Mean Absolute Error: 2.004223201616175
R^2 Score: -3.9058400540183262
