Import libs and funcs

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class EDA:
    def __init__(self, data=None, file_path=None, file_type='csv'):
        if data is not None:
            self.data = data
        elif file_path is not None:
            self.data = self.load_data(file_path, file_type)
        else:
            self.data = pd.DataFrame()
    
    def load_data(self, file_path, file_type='csv'):
        """
        Load data from a file.
        
        Parameters:
            file_path (str): Path to the data file.
            file_type (str): Type of the file ('csv', 'excel', 'json', etc.).
        
        Returns:
            pandas.DataFrame: Loaded data.
        """
        if file_type == 'csv':
            return pd.read_csv(file_path)
        elif file_type == 'tsv':
            return pd.read_csv(file_path, sep='\t')
        elif file_type == 'excel':
            return pd.read_excel(file_path)
        elif file_type == 'json':
            return pd.read_json(file_path)
        else:
            raise ValueError("Unsupported file type.")
    
    def data_types(self):
        """
        Display data types of each column.
        """
        print(self.data.dtypes)
    
    def missing_values(self):
        """
        Summarize missing values in the dataset, showing both count and percentage.
        """
        # Calculate the number of missing values per column
        missing = self.data.isnull().sum()
        
        # Calculate the total number of entries per column
        total = self.data.shape[0]
        
        # Compute the percentage of missing values
        percentage = (missing / total) * 100
        percentage = percentage.round(2)  # Format percentage to two decimal places
        
        # Create a DataFrame with missing count and percentage
        df_missing = pd.DataFrame({'Missing Count': missing, 'Missing Percentage': percentage})
        
        # Sort the DataFrame by Missing Count in descending order
        df_missing = df_missing.sort_values(by='Missing Count', ascending=False)
        
        # Print the summary
        print("Missing Values Summary:n", df_missing)
    
    def summary_statistics(self):
        """
        Provide summary statistics of numerical columns.
        """
        print(self.data.describe().T)
    
    def split_columns(self, cat_th=10, car_th=20):
        """
        Splits the columns of the dataset into numerical, categorical, and high cardinality columns.
        
        Parameters:
            cat_th (int, optional): Threshold for unique values to consider a numerical column as categorical.
            car_th (int, optional): Threshold for unique values to consider a categorical column as high cardinality.
        
        Returns:
            cat_cols (list): List of categorical column names.
            num_cols (list): List of numerical column names.
            cat_but_car (list): List of categorical columns with high cardinality.
        """
        # Identify categorical columns (object dtype)
        cat_cols = [col for col in self.data.columns if self.data[col].dtype == "O"]
        
        # Identify numerical columns (non-object dtype)
        num_cols = [col for col in self.data.columns if self.data[col].dtype != "O"]
        
        # Identify numerical columns that should be treated as categorical
        num_but_cat = [col for col in num_cols if self.data[col].nunique() < cat_th]
        
        # Identify categorical columns with high cardinality
        cat_but_car = [col for col in cat_cols if self.data[col].nunique() > car_th]
        
        # Adjust categorical and numerical columns
        cat_cols = [col for col in cat_cols if col not in cat_but_car]
        num_cols = [col for col in num_cols if col not in num_but_cat]
        
        # Add numerical but categorical columns to categorical columns
        cat_cols += num_but_cat
        
        return cat_cols, num_cols, cat_but_car
    
    def handle_missing_values(self, strategy='drop', column=None, value=None):
        """
        Handle missing values using the specified strategy.
        
        Parameters:
            strategy (str): Strategy to handle missing values ('drop', 'fill').
            column (str): Specific column to handle (for 'fill' strategy).
            value (any): Value to fill missing data (for 'fill' strategy).
        """
        if strategy == 'drop':
            if column:
                self.data = self.data.dropna(subset=[column])
            else: 
                self.data = self.data.dropna()
        elif strategy == 'fill':
            if column is not None and value is not None:
                self.data[column].fillna(value, inplace=True)
            else:
                raise ValueError("For 'fill' strategy, column and value must be specified.")
        else:
            raise ValueError("Unsupported strategy. Use 'drop' or 'fill'.")
    
    def remove_duplicates(self):
        """
        Remove duplicate records from the dataset.
        """
        self.data = self.data.drop_duplicates()
    
    def detect_outliers(self, column):
        """
        Detect outliers in a specified numerical column using IQR.
        
        Parameters:
            column (str): Name of the numerical column
        
        Returns:
            pandas.Series: Boolean series indicating outliers.
        """
        Q1 = self.data[column].quantile(0.25)
        Q3 = self.data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_fence = Q1 - 1.5 * IQR
        upper_fence = Q3 + 1.5 * IQR
        return (self.data[column] < lower_fence) | (self.data[column] > upper_fence)
    
    def plot_histogram(self, column):
        """
        Plot a histogram for a specified numerical column.
        
        Parameters:
            column (str): Name of the numerical column.
        """
        plt.hist(self.data[column], bins=10, edgecolor='black')
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.show()
    
    def plot_boxplot(self, column):
        """
        Plot a boxplot for a specified numerical column.
        
        Parameters:
            column (str): Name of the numerical column.
        """
        sns.boxplot(x=self.data[column])
        plt.title(f'Boxplot of {column}')
        plt.show()
    
    def plot_correlation_matrix(self):
        """
        Display a correlation matrix heatmap.
        """
        corr = self.data.corr()
        sns.heatmap(corr, annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix')
        plt.show()
    
    def calculate_correlation(self, column1, column2):
        """
        Calculate the correlation between two numerical columns.
        
        Parameters:
            column1 (str): Name of the first numerical column.
            column2 (str): Name of the second numerical column.
        
        Returns:
            float: Correlation coefficient.
        """
        return self.data[column1].corr(self.data[column2])
    
    def perform_ttest(self, column, group_by_column):
        """
        Perform a t-test to compare means between groups.
        
        Parameters:
            column (str): Name of the numerical column.
            group_by_column (str): Name of the categorical column to group by.
        
        Returns:
            scipy.stats.ttest_indResult: T-test result.
        """
        from scipy import stats
        groups = self.data[group_by_column].unique()
        if len(groups) != 2:
            raise ValueError("Group by column must have exactly two groups for t-test.")
        group1 = self.data[self.data[group_by_column] == groups[0]][column]
        group2 = self.data[self.data[group_by_column] == groups[1]][column]
        return stats.ttest_ind(group1, group2)
    
    def scale_data(self, columns):
        """
        Scale specified numerical columns using standard scaling.
        
        Parameters:
            columns (list): List of column names to scale.
        """
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        self.data[columns] = scaler.fit_transform(self.data[columns])
    
    def encode_categorical(self, column):
        """
        Encode a categorical column using one-hot encoding.
        
        Parameters:
            column (str): Name of the categorical column.
        """
        self.data = pd.get_dummies(self.data, columns=[column], drop_first=True)
    
    def save_data(self, file_path, file_type='csv'):
        """
        Save the cleaned data to a specified file path.
        
        Parameters:
            file_path (str): Path to save the file.
            file_type (str): Type of the file ('csv', 'excel', 'json', etc.).
        """
        if file_type == 'csv':
            self.data.to_csv(file_path, index=False)
        elif file_type == 'excel':
            self.data.to_excel(file_path, index=False)
        elif file_type == 'json':
            self.data.to_json(file_path, orient='records')
        else:
            raise ValueError("Unsupported file type.")

In [64]:
eda = EDA(file_path="../Datasets/TMDB_movie_dataset_v11.csv")

In [65]:
eda.data.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [66]:
eda.data_types()

id                        int64
title                    object
vote_average            float64
vote_count                int64
status                   object
release_date             object
revenue                   int64
runtime                   int64
adult                      bool
backdrop_path            object
budget                    int64
homepage                 object
imdb_id                  object
original_language        object
original_title           object
overview                 object
popularity              float64
poster_path              object
tagline                  object
genres                   object
production_companies     object
production_countries     object
spoken_languages         object
keywords                 object
dtype: object


In [67]:
eda.data["release_date"] = pd.to_datetime(eda.data["release_date"])

In [68]:
cat_cols, num_cols, cat_but_car = eda.split_columns(20, 35)

In [70]:
print("Categorical Columns:", cat_cols)
print("Numerical Columns:", num_cols)
print("Categorical High Cardinality Columns:", cat_but_car)

Categorical Columns: ['status', 'adult']
Numerical Columns: ['id', 'vote_average', 'vote_count', 'release_date', 'revenue', 'runtime', 'budget', 'popularity']
Categorical High Cardinality Columns: ['title', 'backdrop_path', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords']


In [55]:
spoken_lang_list = eda.data['spoken_languages'].dropna().values
spoken_lang_set = set(lang.strip() for spoken in spoken_lang_list for lang in spoken.split(", ")) # bunu  


In [56]:
list(spoken_lang_set)[:5]

['Ewe', 'Serbian', 'Tahitian', 'Malay', 'Burmese']

In [57]:
eda.data["original_language"].unique().tolist()[:5]

['en', 'ko', 'fr', 'ja', 'it']

In [30]:
languages = pd.read_csv("../Datasets/iso_639_1codes.csv") # dillerin kısaltmaları ve isimlerini içeren dosya bunu kullanarak dillerin isimlerini alacağız

In [31]:
languages.head()

Unnamed: 0,ISO_Code,Language
0,aa,Afar
1,ab,Abkhaz
2,af,Afrikaans
3,ak,Akan
4,am,Amharic


In [None]:
lang_map = dict(zip(languages["ISO_Code"], languages["Language"])) 
eda.data["original_language"] = eda.data["original_language"].replace(lang_map) # dillerin kısaltmalarını isimlerine çeviriyoruz

In [None]:
eda.data["original_language"].unique().tolist()[:5] 

['English', 'Korean', 'French', 'Japanese', 'Italian']

böyle daha iyi

In [None]:
eda.missing_values() 

Missing Values Summary:n                       Missing Count  Missing Percentage
homepage                    1018772               89.41
tagline                      979686               85.98
backdrop_path                837192               73.48
keywords                     831032               72.94
production_companies         625499               54.90
imdb_id                      536018               47.04
production_countries         505303               44.35
spoken_languages             486025               42.66
genres                       457074               40.12
poster_path                  360017               31.60
overview                     231670               20.33
release_date                 188032               16.50
date                         188032               16.50
original_title                   13                0.00
title                            13                0.00
id                                0                0.00
vote_count             

<br> kolonlarda homepage e kesinlikle ihtiyacımız yok

In [35]:
eda.data.drop(columns=["homepage"], axis=1, inplace=True) 

<br> title nan ise kaldıracağım zaten 13 satır kaybımız yok

In [36]:
eda.handle_missing_values('drop', 'title') 

In [37]:
before_filling = eda.data["genres"].isna().sum()

In [58]:
imdb_data = pd.read_csv("../Datasets/title.basics.tsv", sep="\t") # bunu kullanarak genres sütununu dolduracağız

  imdb_data = pd.read_csv("../Datasets/title.basics.tsv", sep="\t")


In [83]:
imdb_eda = EDA(imdb_data)
imdb_eda.data_types()

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object


In [None]:
imdb_eda.data.head()

In [86]:
# Step 1: Identify rows with missing genres but existing imdb_id
missing_genres = eda.data[(eda.data["genres"].isna()) & (eda.data["imdb_id"].notna())]

# Step 2: Extract imdb_ids that need genre data
imdb_ids_to_fill = missing_genres["imdb_id"].tolist()

# Step 3: Retrieve corresponding genres from imdb_data
key = imdb_data[imdb_data["tconst"].isin(imdb_ids_to_fill)][["tconst", "genres"]]

# Step 4: Merge eda.data with key on imdb_id and tconst
merged_df = eda.data.merge(key, left_on="imdb_id", right_on="tconst", how="left")

# Step 5: Fill missing genres in genres_y with genres_x
merged_df["genres_y"] = merged_df["genres_y"].fillna(merged_df["genres_x"])

# Step 6: Update the original genres column with filled data
eda.data["genres"] = merged_df["genres_y"]

In [87]:
before_filling - eda.data["genres"].isna().sum()  # genres kolonundan 150459 satır veri kurtardık

np.int64(150467)

In [None]:
type_key = imdb_eda.data.loc[imdb_data["tconst"].isin(eda.data["imdb_id"]), ["tconst", "titleType"]]
type_key.rename(columns={"tconst": "imdb_id"}, inplace=True)
eda.data = eda.data.merge(type_key, how="left", on="imdb_id")
eda.data["titleType"].value_counts() # bu kolonu da context_data kolonuna ekleyeceğim

In [89]:
eda.data.shape

(1139383, 24)

<br> overview, genres, tagline, keywords, production_companies, production_countriesi, spoken_language, original_language, release_date
<br> kolonlarının birleşiminden combined_features isimli bir kolon oluşturacağım ve nlp ye dayalı modelim bu kolon üzerinde çalışacak

In [121]:
eda.data["combined_features"] = eda.data["overview"] \
+ " " + eda.data["genres"].fillna("") \
+ " " + eda.data["tagline"].fillna("") \
+ " " + eda.data["keywords"].fillna("") \
+ " " + eda.data["production_companies"].fillna("") \
+ " " + eda.data["production_countries"].fillna("") \
+ " " + eda.data["spoken_languages"].fillna("") \
+ " " + eda.data["original_language"].fillna("") \
+ " " + eda.data["release_date"].fillna("")


In [90]:
eda.remove_duplicates()

In [91]:
eda.data.shape

(1139015, 24)

In [144]:
eda.data[eda.data["genres"].isna() & eda.data["imdb_id"].notna()]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,titleType,combined_features
38983,338418,Nirvana: Smells Like Teen Spirit,7.400,29,Released,1991-01-01,0,0,False,0,...,2.468,/84PnF5DC9HMVRSNDm36uhekOfyZ.jpg,,,,,,,,
67007,193806,Thanksgiving Prayer,6.400,12,Released,1991-01-01,0,2,False,0,...,0.605,/bt1K9xTOmBR6gs7BTC1ycVzE6CQ.jpg,,,,United States of America,English,"politics, poetry, thanksgiving, short film",,An indictment ballad of all the different poli...
73788,919999,Remember the Time,7.545,11,Released,1992-02-02,0,9,False,0,...,1.205,/oa2iI6ZVPXd2Oh3zyEcVuzPomUU.jpg,,,,United States of America,,,,The second short film produced for Michael Jac...
85160,338289,Twisted Sister: We're Not Gonna Take It,7.100,8,Released,1984-01-01,0,0,False,0,...,0.600,,,,,,,,,
85988,204745,Falling for Scratte,5.600,8,Released,2009-10-27,0,9,False,0,...,1.832,/1sUk97R91ycRUV1ZzGHscWUHusq.jpg,,,Blue Sky Studios,United States of America,English,,,A behind the scenes video about Scrat and Scra...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132604,744439,The Other Side,0.000,0,Released,,0,13,False,0,...,0.600,,,,,,,,,"A detective interrogates Monroe, a suspect in ..."
1136642,735537,Abudu: Lieju Kartybę,0.000,0,Released,2020-03-09,0,4,False,0,...,0.600,/biGyVrI1sa22MxQubf6rH45Kf3F.jpg,,,,,,,,Life's moments need to be let out. Engli...
1136977,735663,Static Shock,0.000,0,Planned,,0,0,False,0,...,0.625,/uoQNLBmhbs56YdOv7EJi77CTeVj.jpg,,,"Outlier Society Productions, Milestone Media",United States of America,English,,,A feature film based on the DC superhero. O...
1137056,737537,Marsden,0.000,0,In Production,,0,12,False,0,...,0.600,/5pmkWsC6bMosC3nbBoCtXI0ZeFV.jpg,WHAT DOES IT TAKE TO PULL THE TRIGGER?,,,,,,,"To his employers, Marsden is a reliable and ru..."


In [143]:
eda.missing_values()

Missing Values Summary:
                       Missing Count  Missing Percentage
tagline                      979348               85.98
keywords                     830735               72.93
production_companies         625219               54.89
titleType                    537832               47.22
imdb_id                      535744               47.04
production_countries         505011               44.34
spoken_languages             485742               42.65
poster_path                  352858               30.98
genres                       306404               26.90
overview                     231550               20.33
combined_features            231550               20.33
release_date                 187808               16.49
title                             0                0.00
vote_average                      0                0.00
id                                0                0.00
vote_count                        0                0.00
runtime                

In [110]:
eda.data[eda.data["poster_path"].isna() & eda.data["backdrop_path"].notna()].shape

(6922, 24)

<br> poster yerine backdrop resmini kullanabileceğim yaklaşık 7 bin veri var 
<br> uygulamamda sadece poster resmini kullanacağım için backdrop tan ihtiyacım olan eksik verileri çekip backdropu kaldıracağım

In [111]:
mask = eda.data["poster_path"].isna() & eda.data["backdrop_path"].notna()
eda.data["poster_path"] = eda.data["poster_path"].mask(mask, eda.data["backdrop_path"])
eda.data.drop(columns=["backdrop_path"], axis=1,inplace=True)