In [295]:
import pandas as pd
import numpy as np
import pycountry


# Country table

In [296]:
# Get a list of all countries
countries_short = [country.alpha_2 for country in pycountry.countries]
countries_name = [country.name for country in pycountry.countries]

countries_df = pd.DataFrame({'handle': countries_short, 'name': countries_name}).reset_index()
countries_df = countries_df.rename(columns={"index":"country_id"})

# Movies Dataset cleaning

In [336]:
movies = pd.read_csv(r"C:\Users\Martijn\Downloads\netflix_titles_anandshaw.csv")
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [337]:
# How many rows with including the words "alien" and "ufo" in the title or description? 
filtered_df = movies[movies['title'].str.contains('alien|ufo|extraterrestrial|spaceship|spacecraft|cosmic|intergalactic|martian|extraterrestrials|galactic|asteroid|space|starship', case=False, na=False) | 
                     movies['description'].str.contains('alien|ufo|extraterrestrial|spaceship|spacecraft|cosmic|intergalactic|martian|extraterrestrials|galactic|asteroid|space|starship', case=False, na=False)]
print(len(filtered_df))

# create new column that indicates if movie has ufo theme
movies['ufo_theme'] = np.where(
    movies['description'].str.contains(
        'alien|ufo|extraterrestrial|spaceship|spacecraft|cosmic|intergalactic|martian|extraterrestrials|galactic|asteroid|space|starship', 
        case=False, 
        na=False
    ), 
    'yes', 
    'no'
)

movies.columns

174


Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'ufo_theme'],
      dtype='object')

In [None]:
movies['date_added'] = pd.to_datetime(movies['date_added'], errors='coerce')
# Drop rows where 'date_added' is NaT due to coercion errors
movies = movies.dropna(subset=['date_added'])
movies = movies[['date_added', 'ufo_theme', 'release_year', 'type', 'title']]
movies['date_added_formatted'] = movies['date_added'].dt.strftime('%m/%Y')

# UFO reports cleaning

In [300]:
ufo_report  = pd.read_csv(r"C:\Users\Martijn\Downloads\nuforc_reports (1).csv")

In [302]:
ufo_report.info()

# rename columns to lower case
ufo_report.columns = ufo_report.columns.str.lower()

# remove columns 'Date', 'Posted', 'Shape', 'Duration', 'Image', 'Link', 'Summary', 'Text' (to avoid confusion about correct date column and other columns)
ufo_report = ufo_report.drop(columns=['date','posted', 'shape', 'duration', 'image', 'link', 'summary', 'text'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143940 entries, 0 to 143939
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Date_Table  143940 non-null  object
 1   Date        143940 non-null  object
 2   Posted      143940 non-null  object
 3   City        143933 non-null  object
 4   State       143895 non-null  object
 5   Country     143938 non-null  object
 6   Shape       143940 non-null  object
 7   Duration    143915 non-null  object
 8   Image       143940 non-null  object
 9   Link        143940 non-null  object
 10  Summary     143940 non-null  object
 11  Text        143906 non-null  object
dtypes: object(12)
memory usage: 13.2+ MB


In [303]:
# rename column date_table
ufo_report = ufo_report.rename(columns={'date_table': 'date'})
print(ufo_report)


                             date                city    state  country
0                         04/2023            Honolulu       HI      USA
1                         04/2023         Bakersfield  Unknown      USA
2                         04/2023         Castle Dale  Unknown  Unknown
3                         04/2023           Baltimore       MD      USA
4                         03/2023             Madison       WI      USA
...                           ...                 ...      ...      ...
143935                    07/1947             Roswell       NM      USA
143936                    06/1947      Corpus Christi       TX      USA
143937                    06/1952              Auburn       WA      USA
143938  UNSPECIFIED / APPROXIMATE           Troutdale       OR      USA
143939                    06/1950  Budapest (Hungary)  Unknown  Hungary

[143940 rows x 4 columns]


In [304]:
correct_format = r'^\d{1,2}/\d{4}$'

# count rows that do not have date format of MM/YYYY in 'Date_Table'
invalid_rows_count = ufo_report[~ufo_report['date'].str.match(correct_format, na=False)].shape[0]

print(f"invalid rows: {invalid_rows_count}")

# remove rows with invalid date format 
ufo_report = ufo_report[ufo_report['date'].str.match(correct_format, na=False)]
print(ufo_report)

invalid rows: 376
           date                city    state  country
0       04/2023            Honolulu       HI      USA
1       04/2023         Bakersfield  Unknown      USA
2       04/2023         Castle Dale  Unknown  Unknown
3       04/2023           Baltimore       MD      USA
4       03/2023             Madison       WI      USA
...         ...                 ...      ...      ...
143934  07/1954             Oakdale       NY      USA
143935  07/1947             Roswell       NM      USA
143936  06/1947      Corpus Christi       TX      USA
143937  06/1952              Auburn       WA      USA
143939  06/1950  Budapest (Hungary)  Unknown  Hungary

[143564 rows x 4 columns]


In [305]:
# based on unique values from country we clean country column and remove invalid values
unique_countries = ufo_report['country'].unique()
print(unique_countries)

# List of invalid values to remove
invalid_countries = ['no', 'unknown', 'none', 'not applicable', 'unknown/at sea', 'unavailable', 'in orbit', 'space', 'atlantic ocean', 'caribbean sea', 'pacific ocean', 'international space station', 'moon', 'mars', 'none', 'not found']

# Convert the 'Country' column to lowercase and filter out invalid values
ufo_report = ufo_report[~ufo_report['country'].str.lower().isin(invalid_countries)]

['USA' 'Unknown' 'United Kingdom' 'Canada' 'turkey' 'India' 'Australia'
 'Malta' 'Switzerland' 'France' 'Guam' 'Puerto Rico' 'Ukraine' 'Mexico'
 'Ireland' 'Japan' 'Germany' 'Poland' 'New Zealand' 'Pakistan'
 'South Africa' 'Papua New Guinea' 'Kenya' 'Thailand' 'Israel' 'Denmark'
 'Malaysia' 'Lebanon (Middle East)' 'Argentina' 'Myanmar' 'Cambodia'
 'Croatia' 'On the way to Cozumel' 'South-Africa' 'Brazil' 'Macedonia'
 'Cyprus' 'China' 'Panama' 'Romania' 'In Orbit' 'Trinidad/Tobago'
 'Jamaica' 'Luxemburg' 'Italy' 'East China Sea' 'Bulgaria' 'Iran' 'Spain'
 'Decalb' 'Portugal' 'Turkey' 'Indonesia' 'Lebanon' 'South Korea'
 'North Wales' 'Belgium' 'Netherlands' 'Luxembourg' 'Philippines'
 'Guatemala' 'Srui Lanka' 'Jordan' 'Afghanistan' 'Finland' 'Taiwan'
 'Algeria' 'Venezuela' 'Bahamas'
 'South Georgia and the south sandwich islands' 'Costa Rica' 'Honduras'
 'Lithuania' 'Bahrain' 'Palau' 'Western Australia' 'Mozambique'
 'Dominican Republic' 'Belize' 'Slovenia' 'Bosnia and Herzegovina'
 'Co

In [306]:
# create Country_Code column
# Funktion zum Abrufen des ISO 3166-1 Alpha-2 Ländercodes
def get_country_code(country_name):
    if isinstance(country_name, str):  # Prüfen, ob es eine Zeichenkette ist
        country = pycountry.countries.get(name=country_name)
        return country.alpha_2 if country else None
    return None  # Falls der Wert kein String ist (z. B. NaN)

# Beispiel-Datenframe erstellen (falls noch nicht vorhanden)
# ufo_report = pd.read_csv("deine_datei.csv")  # Falls Daten aus einer CSV geladen werden

# Neue Spalte "Country_Code" erstellen
ufo_report['country_code'] = ufo_report['country'].apply(lambda x: get_country_code(x) 
    if isinstance(x, str) and x.lower() not in ['non applicable', 'unknown', 'in orbit', 'at sea'] else None)

# Zeilen mit fehlendem "Country_Code" entfernen
ufo_report = ufo_report.dropna(subset=['country_code'])

In [None]:
# Merge ufo_report with countries_df to get the corresponding country_id
ufo_report = ufo_report.merge(countries_df[['handle', 'country_id']], how='left', left_on='country_code', right_on='handle')

# Replace the country_code column with the country_id column
ufo_report['country_code'] = ufo_report['country_id']

# Drop unnecessary columns
ufo_report.drop(columns=['handle', 'country_id', "country"], inplace=True)
ufo_report = ufo_report.rename(columns={"country_code":"country_id"})

Unnamed: 0,date,city,state,country_id
0,06/2021,Felpham/Bognor Regis (UK/England),Unknown,79
1,10/2014,Swindon (UK/England),Unknown,79
2,04/2020,Brocket (Canada),AB,39
3,08/2013,Lincoln (Lincolnshire) (UK/England),Unknown,79
4,08/2014,Port Coquitlam (Canada),BC,39
...,...,...,...,...
14581,01/1958,Birchy Bay (Canada),NF,39
14582,04/1957,Lahore,Unknown,172
14583,10/1955,Chester (UK/England),Unknown,79
14584,10/1952,Fukuoka (Japan),Unknown,115


# Subscribers Cleaning

In [308]:
subscribers = pd.read_csv(r"C:\Users\Martijn\Downloads\subscribers_netflix_2024 (1).csv")

In [309]:
subscribers = subscribers.rename(columns={'estimated_subscribers': 'subscribers'}).reset_index(drop=True)


In [310]:
# Merge ufo_report with countries_df to get the corresponding country_id
subscribers = subscribers.merge(countries_df[['handle', 'country_id']], how='left', left_on='country_code', right_on='handle')

# Replace the country_code column with the country_id column
subscribers['country_code'] = subscribers['country_id']

# Drop unnecessary columns
subscribers.drop(columns=['handle', 'country_id', "country"], inplace=True)
subscribers = subscribers.rename(columns={"country_code":"country_id"})