In [24]:
#  64c70f22564c1c22f2430afc15b3c64a   API Key

### Fetching Data of Movies from TMDB Website

### Task 1: Preparation of Dataset

In [38]:
# importing libraries
import requests
import pandas as pd

# API Key requested from TMDB website
API_KEY = '64c70f22564c1c22f2430afc15b3c64a'
LANGUAGE = 'en-US'

# Function to fetch movie data from TMDB API
def fetch_movie_data(page):
    url = f'https://api.themoviedb.org/3/movie/top_rated'
    params = {
        'api_key': API_KEY,
        'language': LANGUAGE,
        'page': page
    }
    response = requests.get(url, params=params)
    return response.json()

# Function to fetch genre data from TMDB API
def fetch_genre_data():
    url = 'https://api.themoviedb.org/3/genre/movie/list'
    params = {
        'api_key': API_KEY,
        'language': LANGUAGE
    }
    response = requests.get(url, params=params)
    return response.json()

genre_data = fetch_genre_data()
genre_mapping = {genre['id']: genre['name'] for genre in genre_data['genres']}

# Create an empty dataframe to store movie data
#Then will convert this Dataframe to CSV file
movie_df = pd.DataFrame(columns=['ID', 'Movie Name', 'Overview', 'Genre'])

# Loop through all 471 pages and fetch movie data

for page in range(1, 472):
    movie_data = fetch_movie_data(page)
    for movie in movie_data['results']:
        movie_id = movie['id']
        movie_name = movie['title']
        overview = movie['overview']
        genre_ids = movie['genre_ids']
        genres = [genre_mapping.get(genre_id, 'Unknown') for genre_id in genre_ids]

        movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview': overview, 'Genre': ', '.join(genres)}, ignore_index=True)

# Save the dataset to a CSV file
movie_df.to_csv('Movies.csv', index=False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview': overview, 'Genre': ', '.join(genres)}, ignore_index=True)
  movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview': overview, 'Genre': ', '.join(genres)}, ignore_index=True)
  movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview': overview, 'Genre': ', '.join(genres)}, ignore_index=True)
  movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview': overview, 'Genre': ', '.join(genres)}, ignore_index=True)
  movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview': overview, 'Genre': ', '.join(genres)}, ignore_index=True)
  movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview': overview, 'Genre': ', '.join(genres)}, ignore_index=True)
  movie_df = movie_df.append({'ID': movie_id, 'Movie Name': movie_name, 'Overview':

In [39]:
movie_df.head(10)
# Display the first 10 rows of the Dataframe generated
# Csv file of this Dataframe is generated in the above code


Unnamed: 0,ID,Movie Name,Overview,Genre
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime"
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy, Drama, Romance"
5,389,12 Angry Men,The defense and the prosecution have rested an...,Drama
6,129,Spirited Away,"A young girl, Chihiro, becomes trapped in a st...","Animation, Family, Fantasy"
7,496243,Parasite,"All unemployed, Ki-taek's family takes peculia...","Comedy, Thriller, Drama"
8,372058,Your Name.,High schoolers Mitsuha and Taki are complete s...,"Romance, Animation, Drama"
9,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller"


### Task 2
### Data Cleaning



In [40]:
# importing libraries for Data Cleaning
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [41]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [42]:
# Load the dataset
movie_df = pd.read_csv('/content/Movies.csv')

### Check for Missing values

In [43]:
# Check for missing values
missing_values = movie_df.isnull().sum()
print("Missing Values in the Entire DataFrame:")
print(missing_values)

Missing Values in the Entire DataFrame:
ID            0
Movie Name    0
Overview      1
Genre         6
dtype: int64


### Checking for Duplicate Rows

In [44]:
# Checking for duplicate rows in the entire DataFrame
duplicate_rows = movie_df.duplicated().sum()
print("Duplicate Rows in the DataFrame:", duplicate_rows)


Duplicate Rows in the DataFrame: 0


### Data Cleaning by removing special characters and whitespaces


In [45]:
# Import the regular expression library
import re

# Define a function to remove special characters and extra whitespace
def clean_text(text):
    try:
        # Check if the input is a valid string (not NaN or None)
        if isinstance(text, str):
            # Remove special characters and extra whitespace
            cleaned_text = re.sub(r'[^\w\s]', '', text)
            cleaned_text = ' '.join(cleaned_text.split())
            return cleaned_text
        else:
            # If the input is not a valid string, return an empty string
            return ''
    except Exception as e:
        # Handle any exceptions (e.g., invalid regular expression)
        print(f"An error occurred: {str(e)}")
        return ''

# Apply the clean_text function to the "Overview" column
movie_df['Overview'] = movie_df['Overview'].apply(clean_text)


### Handling Categorical Columns

In [46]:
### One Hot Encoding from Genre Column
# movie_df = pd.get_dummies(movie_df, columns=['Genre'], prefix=['Genre'])
### THis is done for Training and Feature Engineering usually, so I am not gonna run this right now


In [47]:
movie_df.head(5)

Unnamed: 0,ID,Movie Name,Overview,Genre
0,238,The Godfather,Spanning the years 1945 to 1955 a chronicle of...,"Drama, Crime"
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime"
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,19404,Dilwale Dulhania Le Jayenge,Raj is a rich carefree happygolucky second gen...,"Comedy, Drama, Romance"




*   Tokenization
*   Stemming
*   Lemmatization



   



### Tokenization

In [48]:
# Function for Tokenization
# Define functions for text preprocessing
def preprocess_text(text):
    try:
        # Tokenization
        tokens = word_tokenize(text)

        # Remove punctuation and convert to lowercase
        tokens = [word.lower() for word in tokens if word.isalpha()]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

        return ' '.join(tokens)
    except:
        return ''

### Stemming

In [49]:
# Function for Stemming
def stem_text(text):
    try:
        # Stemming using Porter Stemmer
        stemmer = PorterStemmer()
        tokens = word_tokenize(text)
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed_tokens)
    except:
        return ''

### Lemmatization

In [50]:
# Function for Lemmatization
def lemmatize_text(text):
    try:
        # Lemmatization using WordNet Lemmatizer
        lemmatizer = WordNetLemmatizer()
        tokens = word_tokenize(text)
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(lemmatized_tokens)
    except:
      # Handle exceptions (missing values or non-string data)
        return ''

In [51]:
# text preprocessing functions to the "Overview" column

# preprocessing function to clean the text and remove stopwords
movie_df['Overview'] = movie_df['Overview'].apply(preprocess_text)

# preprocessing function to clean the text and remove stopwords
movie_df['Overview_Stemmed'] = movie_df['Overview'].apply(stem_text)

# lemmatization to the cleaned text
movie_df['Overview_Lemmatized'] = movie_df['Overview'].apply(lemmatize_text)

# Saving the preprocessed dataset
movie_df.to_csv('Movies_Preprocessed.csv', index=False)
### Movies_preprocessed.csv file contains the cleaned data, and can be used for training a model.
## Can apply other techniques to process the Dataset

In [52]:
### Display the Dataframe after all the Data Preprocessing and saving it to the CSV File
movie_df.head(10)

Unnamed: 0,ID,Movie Name,Overview,Genre,Overview_Stemmed,Overview_Lemmatized
0,238,The Godfather,spanning years chronicle fictional italianamer...,"Drama, Crime",span year chronicl fiction italianamerican cor...,spanning year chronicle fictional italianameri...
1,278,The Shawshank Redemption,framed double murder wife lover upstanding ban...,"Drama, Crime",frame doubl murder wife lover upstand banker a...,framed double murder wife lover upstanding ban...
2,240,The Godfather Part II,continuing saga corleone crime family young vi...,"Drama, Crime",continu saga corleon crime famili young vito c...,continuing saga corleone crime family young vi...
3,424,Schindler's List,true story businessman oskar schindler saved t...,"Drama, History, War",true stori businessman oskar schindler save th...,true story businessman oskar schindler saved t...
4,19404,Dilwale Dulhania Le Jayenge,raj rich carefree happygolucky second generati...,"Comedy, Drama, Romance",raj rich carefre happygolucki second gener nri...,raj rich carefree happygolucky second generati...
5,389,12 Angry Men,defense prosecution rested jury filing jury ro...,Drama,defens prosecut rest juri file juri room decid...,defense prosecution rested jury filing jury ro...
6,129,Spirited Away,young girl chihiro becomes trapped strange new...,"Animation, Family, Fantasy",young girl chihiro becom trap strang new world...,young girl chihiro becomes trapped strange new...
7,496243,Parasite,unemployed kitaeks family takes peculiar inter...,"Comedy, Thriller, Drama",unemploy kitaek famili take peculiar interest ...,unemployed kitaeks family take peculiar intere...
8,372058,Your Name.,high schoolers mitsuha taki complete strangers...,"Romance, Animation, Drama",high schooler mitsuha taki complet stranger li...,high schoolers mitsuha taki complete stranger ...
9,155,The Dark Knight,batman raises stakes war crime help lt jim gor...,"Drama, Action, Crime, Thriller",batman rais stake war crime help lt jim gordon...,batman raise stake war crime help lt jim gordo...
