# **Read and merge datasets**

In [154]:
import ast

import pandas as pd

# Read the movies and credits tables from their CSV files.
movies_csv_path = '/content/drive/MyDrive/tmdb_5000_movies.csv'
credits_csv_path = '/content/drive/MyDrive/tmdb_5000_credits.csv'

movies = pd.read_csv(movies_csv_path)
credits = pd.read_csv(credits_csv_path)


In [None]:
# Print the column names, non-null counts and dtypes of the movies table.
movies.info()

In [None]:
# Print the column names, non-null counts and dtypes of the credits table.
credits.info()

In [None]:
# Preview the first five rows of the credits table.
credits.head(5)

In [155]:
# Relabel the four credits columns so the first one is called 'id', the key
# shared with `movies`, then join the two tables on it. Both tables carry a
# 'title' column, so the merged frame holds them as title_x and title_y.
credits.columns = pd.Index(['id', 'title', 'cast', 'crew'])
movies = pd.merge(movies, credits, on='id')

Note 1. In this part, the two CSV files were merged to obtain one comprehensive dataset.

# **Data cleaning**

In [None]:
# Print the column names, non-null counts and dtypes of `movies`.
movies.info()

# **Handle null and duplicate rows**



In [156]:
# Count the missing values in each column of `movies`.
movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title_x                    0
vote_average               0
vote_count                 0
title_y                    0
cast                       0
crew                       0
dtype: int64

In [157]:
# Placeholder-impute the two sparse text columns.
# The filled column is assigned back rather than calling
# `movies[col].fillna(..., inplace=True)`: that form operates on a column
# selected out of the frame, which newer pandas releases deprecate and which
# no longer updates `movies` at all once Copy-on-Write is enabled.
movies['homepage'] = movies['homepage'].fillna('No Homepage')
movies['tagline'] = movies['tagline'].fillna('No Tagline')

In [158]:
# Remove, in place, every row that still has a missing value in any column.
movies.dropna(axis=0, how='any', inplace=True)

In [159]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
movies.drop_duplicates(keep='first', inplace=True)

In [146]:
# Count missing values per column again to confirm that none remain.
movies.isnull().sum()

budget                  0
genres                  0
homepage                0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title_x                 0
vote_average            0
vote_count              0
title_y                 0
cast                    0
crew                    0
dtype: int64

Note 2. Two features, `homepage` (more than 3,000 rows) and `tagline` (more than 800 rows), have a large number of nulls relative to the total number of rows (4,803).
In this situation, the rows with null values cannot simply be deleted, because most of the dataset would be lost.
There are several ways to handle missing values, such as text analysis, imputation with a placeholder, feature engineering, and modeling with missing values; the right choice depends on the goal of the project.
Since the goal of this project is to find out whether a film was profitable, which relies on the `budget` and `revenue` columns, the columns with missing values can be removed in a later step.
As a result, at the data-cleaning stage I applied the "imputation with a placeholder" method and filled the null values in these two columns with "No Homepage" and "No Tagline".

Moreover, a few other columns contain a small number of null values; I dropped the rows that contain them.

# **Handle missing values and outliers**

In [None]:
# Print the column names, non-null counts and dtypes of `movies`.
movies.info()

In [160]:
# `df` is bound to the same DataFrame object as `movies` (no copy is made),
# so in-place changes made through `df` are also visible through `movies`.
df = movies

In [161]:
# Print the column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4799 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4799 non-null   int64  
 1   genres                4799 non-null   object 
 2   homepage              4799 non-null   object 
 3   id                    4799 non-null   int64  
 4   keywords              4799 non-null   object 
 5   original_language     4799 non-null   object 
 6   original_title        4799 non-null   object 
 7   overview              4799 non-null   object 
 8   popularity            4799 non-null   float64
 9   production_companies  4799 non-null   object 
 10  production_countries  4799 non-null   object 
 11  release_date          4799 non-null   object 
 12  revenue               4799 non-null   int64  
 13  runtime               4799 non-null   float64
 14  spoken_languages      4799 non-null   object 
 15  status               

In [162]:
# Some columns hold list-like text in which an empty list appears as the
# literal string "[]". Treat those cells as missing by replacing them with NaN.
# One vectorized replace does the same work as visiting every cell with
# iterrows()/.at, without the rows x columns Python-level loop. Only cells
# whose whole value is exactly "[]" are affected. The replacement stays in
# place because `df` shares its object with `movies` (`df = movies`).
df.replace("[]", float('nan'), inplace=True)

In [163]:
# Count missing values per column now that empty lists have become NaN.
df.isnull().sum()

budget                    0
genres                   27
homepage                  0
id                        0
keywords                410
original_language         0
original_title            0
overview                  0
popularity                0
production_companies    349
production_countries    172
release_date              0
revenue                   0
runtime                   0
spoken_languages         84
status                    0
tagline                   0
title_x                   0
vote_average              0
vote_count                0
title_y                   0
cast                     41
crew                     27
dtype: int64

In [None]:
# Show every distinct raw value found in the production_countries column.
unique_values = df.loc[:, 'production_countries'].unique()
print(unique_values)

In [164]:
# Columns checked for NaN; any row missing a value in at least one of them
# is removed from `df` in place.
list_like_columns = [
    'genres',
    'cast',
    'crew',
    'spoken_languages',
    'keywords',
    'production_companies',
    'production_countries',
]
df.dropna(subset=list_like_columns, inplace=True)

In [166]:
# Count missing values per column again to confirm that none remain.
df.isnull().sum()

budget                  0
genres                  0
homepage                0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title_x                 0
vote_average            0
vote_count              0
title_y                 0
cast                    0
crew                    0
dtype: int64

In the cells above, the features ['genres', 'cast', 'crew', 'spoken_languages', 'keywords', 'production_companies', 'production_countries'] contained missing values stored as empty lists (`[]`); I first converted these to null (NaN) and then dropped the affected rows.

In [88]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# NOTE(review): the target built below is a binary "is `genres` present?" flag,
# not the genre labels themselves, so this model can only predict whether a row
# has genres. If the rows with missing `genres` were already dropped before
# this cell runs, every label is 1 and the reported accuracy is trivially
# perfect.

# Data preprocessing
# Convert 'original_language' to numerical format using one-hot encoding.
# get_dummies removes the source column, so the guard keeps the cell safe to
# re-run instead of failing with a KeyError on the second execution.
if 'original_language' in df.columns:
    df = pd.get_dummies(df, columns=['original_language'])

# Feature selection
# NOTE(review): 'id' looks like a row identifier rather than a descriptive
# feature; confirm it is meant to be a model input.
selected_features = ['budget', 'id', 'vote_count', 'original_language_en']  # Add more features as needed

# Split data into features and target variable
X = df[selected_features].dropna()  # Drop rows with missing values
y = df.loc[X.index, 'genres'].notna().astype(int)  # 1 for non-null, 0 for null

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier (seeded so a re-run reproduces the result)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict the presence flag for the held-out rows (computed once, reused below)
predicted_genres = model.predict(X_test)

# Keep the predictions in their own column. They are 0/1 flags, so writing
# them into 'genres' would overwrite the real genre data of every test row.
df.loc[X_test.index, 'genres_present_pred'] = predicted_genres

# Evaluate the model (optional)
accuracy = accuracy_score(y_test, predicted_genres)
print("Accuracy:", accuracy)

# Save the updated dataset, including the prediction column
df.to_csv('updated_dataset.csv', index=False)





Accuracy: 0.9958333333333333


In [None]:
def _country_names(raw):
    """Return the set of country names listed in one `production_countries` cell.

    Parameters
    ----------
    raw : str
        Text of a literal list of dicts, each carrying a 'name' key.

    Returns
    -------
    set of str
        The 'name' value of every entry; an empty set for a non-string cell
        such as NaN.
    """
    if not isinstance(raw, str):
        return set()
    # literal_eval only accepts Python literals, so text read from the CSV is
    # parsed as data and can never be executed as code (unlike eval).
    return {entry['name'] for entry in ast.literal_eval(raw)}


# The guard lets the cell be re-run after the source column has been dropped.
if 'production_countries' in df.columns:
    # Parse every cell once and reuse the parsed sets below.
    countries_per_movie = df['production_countries'].apply(_country_names)

    # Every distinct country name. The variable keeps its original name in case
    # a later cell still reads it.
    unique_genres = set().union(*countries_per_movie)

    # One 0/1 indicator column per country. Membership is tested against the
    # parsed names, not the raw text, so a country whose name is contained in
    # another country's name is no longer flagged by mistake. Columns are built
    # together and in sorted order, which avoids fragmenting the frame and
    # keeps the column order stable between runs.
    country_flags = pd.DataFrame(
        {
            country: countries_per_movie.apply(
                lambda names, country=country: int(country in names)
            )
            for country in sorted(unique_genres)
        },
        index=df.index,
    )

    # Swap the original 'production_countries' column for the indicator columns.
    df = pd.concat([df.drop(columns=['production_countries']), country_flags], axis=1)


In [None]:
# Print the column names, non-null counts and dtypes of `df`.
df.info()