In [1]:
!pip install pandas
!pip install seaborn

Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, numpy, pandas
Successfully installed numpy-2.1.0 pandas-2.2.2 tzdata-2024.1
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
import pandas as pd

In [3]:
def split_and_create_columns(df, column_name):
    """Expand a comma-separated text column into boolean indicator columns.

    Every cell of ``df[column_name]`` is cast to pandas ``string`` dtype, missing
    values are replaced by the literal ``'UnKnown'`` and the text is split on
    ``','``. For each distinct token (surrounding whitespace stripped) a boolean
    column named ``column_name + "_" + token`` is added; it is ``True`` on the
    rows whose cell contained that token and ``False`` elsewhere. New columns
    appear in order of first occurrence. The source column is dropped afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    column_name : str
        Name of the column holding the comma-separated values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after modification.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # Normalise to text and give missing cells an explicit label.
    cleaned = df[column_name].astype('string').fillna('UnKnown')

    # Tokenise each row exactly once; stripping here makes ' Comedy' and
    # 'Comedy' the same label.
    row_tokens = [
        [piece.strip() for piece in cell.split(',')]
        for cell in cleaned
    ]

    # dict.fromkeys de-duplicates in O(1) per token while keeping first-seen order.
    labels = dict.fromkeys(token for tokens in row_tokens for token in tokens)

    # Indicators are assigned positionally (as plain lists), so the result is
    # correct even when the frame's index contains duplicate labels.
    for label in labels:
        df[column_name + "_" + label] = [label in tokens for tokens in row_tokens]

    df.drop(column_name, axis=1, inplace=True)
    return df

In [4]:
# Load the raw TMDB TV export; this frame is kept untouched as the original.
org_df = pd.read_csv(
    filepath_or_buffer='TMDB_tv_dataset_v3.csv',
    encoding='utf-8',
)

In [5]:
# Work on an independent (deep) copy so org_df is not affected by the cleaning below.
df = org_df.copy(deep=True)

In [6]:
# Parse the columns named in list_date_columns as datetimes;
# values that cannot be parsed are coerced to NaT.
list_date_columns = ['first_air_date', 'last_air_date']
df = df.assign(
    **{name: pd.to_datetime(df[name], errors='coerce') for name in list_date_columns}
)

# Keep only rows whose first_air_date lies in [2015-01-01, 2024-01-01] (both ends
# inclusive); rows with a NaT first_air_date fail the comparison and are removed.
df = df[df['first_air_date'].between('2015-01-01', '2024-01-01')].reset_index(drop=True)

In [7]:
# Columns in which more than 66% of the rows are NaN are discarded
nan_columns = df.columns[df.isna().mean().gt(0.66)]
df = df.drop(columns=nan_columns)

In [8]:
# backdrop_path is not relevant for the analysis, so it is removed
df = df.drop(columns=['backdrop_path'])

In [9]:
# Rows with a missing last_air_date receive the current timestamp.
# NOTE: pd.to_datetime('today') is evaluated when the cell runs, so the filled
# value differs from one execution to the next.
df = df.assign(last_air_date=df['last_air_date'].fillna(pd.to_datetime('today')))

In [10]:
# poster_path is not relevant for the analysis, so it is removed
df = df.drop(columns=['poster_path'])

In [11]:
# Cast every column named in list_columns to the pandas 'string' dtype
list_columns = [
    'name',
    'original_language',
    'original_name',
    'languages',
    'networks',
    'origin_country',
    'spoken_languages',
    'genres',
    'overview',
    'homepage',
    'production_companies',
    'production_countries',
]
df = df.astype({name: 'string' for name in list_columns})

In [12]:
# Cast every column named in list_category_columns to the 'category' dtype
list_category_columns = ['original_language', 'adult', 'status', 'in_production', 'type']
df = df.astype({name: 'category' for name in list_category_columns})

In [13]:
# Cast every column named in list_int_columns to int
list_int_columns = ['number_of_seasons', 'number_of_episodes', 'vote_count']
df = df.astype(dict.fromkeys(list_int_columns, int))

# Cast every column named in list_float_columns to float
list_float_columns = ['vote_average', 'popularity']
df = df.astype(dict.fromkeys(list_float_columns, float))

In [14]:
# Split 'genres' on commas and replace it with one boolean column per genre
df = split_and_create_columns(df=df, column_name='genres')

In [15]:
# Every column whose dtype is bool is converted to 'category'
df = df.astype(
    {name: 'category' for name, dtype in df.dtypes.items() if dtype == 'bool'}
)

In [16]:
# Keep the 10 most frequent values of 'networks'; every other value becomes 'Other'
cols = df['networks'].value_counts().head(10).index
df['networks'] = df['networks'].apply(lambda x: 'Other' if x not in cols else x)
  

In [17]:
# Store 'networks' with the 'category' dtype
df = df.astype({'networks': 'category'})

In [18]:
# Remove the strings 'http://' and 'https://' from homepage, then split on "/"
# and keep only the first element (the part before the first slash).
df['homepage'] = (
    df['homepage']
    .str.replace('http://', '')
    .str.replace('https://', '')
    .str.split('/')
    .str[0]
)

In [19]:
# Keep the 10 most frequent values of 'homepage'; every other value becomes
# 'Other'. The result is stored with the 'category' dtype.
cols = df['homepage'].value_counts().head(10).index
df['homepage'] = (
    df['homepage']
    .apply(lambda x: 'Other' if x not in cols else x)
    .astype('category')
)

In [20]:
# production_companies is removed from the frame
df = df.drop(columns=['production_companies'])

In [87]:
#Create a new string-based DataFrame from the columns: overview,homepage,original_name,languages,spoken_languages,production_countries
#df_string = df[['overview', 'homepage', 'original_name', 'languages', 'spoken_languages', 'production_countries']]
#df.drop(['overview', 'homepage', 'original_name', 'languages', 'spoken_languages', 'production_countries'], axis=1, inplace=True)

In [21]:
# Persist the flattened frame both as CSV and as a pickle
df.to_csv(path_or_buf='tv_flat_file.csv')
df.to_pickle(path='tv_flat_file.pkl')
#df_string.to_csv('tv_flat_file_str.csv')
#df_string.to_pickle('tv_flat_file_str.pkl')

In [153]:
#add column total_broadcast_months
#df['total_broadcast_months'] = (df['last_air_date'] - df['first_air_date']).dt.days / 30
#df.drop(['first_air_date','last_air_date'], axis=1, inplace=True)

In [154]:
#remove name - not relevant for prediction
#df.drop(['name'], axis=1, inplace=True)

In [155]:
#convert all categories to number
#df = df.apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)