In [24]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pointbiserialr

In [25]:
df = pd.read_csv("AB_NYC_2019.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [26]:
print("Initial Data Overview:")
print(df.head())

Initial Data Overview:
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room    149               1

In [27]:
# 1. **Data Integrity:**

In [28]:
# Checking for invalid values (e.g., negative price or minimum_nights)
print("\nData Integrity Checks:")
# Price cannot be negative
invalid_price = df[df['price'] < 0]
print(f"Number of invalid price records: {len(invalid_price)}")
# Minimum nights should not be less than 1
invalid_nights = df[df['minimum_nights'] < 1]
print(f"Number of invalid minimum nights records: {len(invalid_nights)}")

# Remove invalid rows
df = df[df['price'] >= 0]
df = df[df['minimum_nights'] >= 1]


Data Integrity Checks:
Number of invalid price records: 0
Number of invalid minimum nights records: 0


In [29]:
# 2. **Missing Data Handling:**

In [30]:
# Checking for missing values
print("\nMissing Data Before Cleaning:")
print(df.isnull().sum())

# Handling missing values by imputing or dropping (example)
df['reviews_per_month'].fillna(0, inplace=True)  # Filling NaN in 'reviews_per_month' with 0
df.dropna(subset=['last_review'], inplace=True)  # Dropping rows where 'last_review' is missing


Missing Data Before Cleaning:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [31]:
# 3. **Duplicate Removal:**

In [32]:
# Checking for duplicates based on a unique identifier like 'id'
print("\nDuplicate Check:")
duplicates = df[df.duplicated(subset='id')]
print(f"Number of duplicate records: {len(duplicates)}")

# Remove duplicate records
df.drop_duplicates(subset='id', keep='first', inplace=True)


Duplicate Check:
Number of duplicate records: 0


In [33]:
# 4. **Standardization:**

In [34]:
# Example: Converting 'price' to consistent units, if necessary (assume in $)
# Also handling categorical values for 'room_type'
df['room_type'] = df['room_type'].str.strip().str.lower()  # Standardize room type formatting

# Example of standardizing formats for categorical variables (if any inconsistencies)
df['neighbourhood_group'] = df['neighbourhood_group'].str.title()
df['neighbourhood'] = df['neighbourhood'].str.title()

In [35]:
# 5. **Outlier Detection:**

In [37]:
# Using Z-score or IQR for detecting outliers in 'price' and 'reviews_per_month'
from scipy import stats

z_scores = stats.zscore(df['price'])
df['z_price'] = z_scores

# Filter for outliers where the Z-score is beyond a threshold (e.g., > 3 or < -3)
outliers = df[(df['z_price'] > 3) | (df['z_price'] < -3)]
print(f"\nOutliers detected in 'price': {len(outliers)}")

# Option to drop outliers
df = df[(df['z_price'] <= 3) & (df['z_price'] >= -3)]

# Drop the 'z_price' column as it's no longer needed
df.drop(columns=['z_price'], inplace=True)

# **Final cleaned dataset preview**
print("\nCleaned Data Overview:")
print(df.head())

# **Export the cleaned dataset to a new CSV file**
df.to_csv("cleaned_datasets.csv", index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'")


Outliers detected in 'price': 775

Cleaned Data Overview:
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   
5  5099         Large Cozy 1 BR Apartment In Midtown East     7322   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   
5        Chris           Manhattan   Murray Hill  40.74767  -73.97500   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0    

In [39]:
df = pd.read_csv("Meta.csv")
df.head()

Unnamed: 0,Path,ClassId,ShapeId,ColorId,SignId
0,Meta/27.png,27,0,0,1.32
1,Meta/0.png,0,1,0,3.29
2,Meta/1.png,1,1,0,3.29
3,Meta/10.png,10,1,0,3.27
4,Meta/11.png,11,0,0,1.22


In [44]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Load the dataset
df = pd.read_csv("Meta.csv")

# 1. Data Integrity: Ensuring accuracy and consistency
print("Initial Data Types:\n", df.dtypes)
print("\nUnique values in 'SignId':\n", df['SignId'].unique())

# 2. Handling Missing Data
# Define a function to impute missing values based on column type
def impute_missing_values(df, column):
    if pd.api.types.is_numeric_dtype(df[column]):
        median_value = df[column].median()
        df[column].fillna(median_value, inplace=True)
        print(f"Filled missing values in '{column}' with median: {median_value}")
    else:
        mode_value = df[column].mode()[0]
        df[column].fillna(mode_value, inplace=True)
        print(f"Filled missing values in '{column}' with mode: {mode_value}")

# Apply the imputation function to relevant columns
for col in ['ClassId', 'ShapeId', 'ColorId', 'SignId']:
    impute_missing_values(df, col)

# Option 2: Drop rows with missing 'Path' (if critical)
initial_row_count = df.shape[0]
df.dropna(subset=['Path'], inplace=True)
dropped_rows = initial_row_count - df.shape[0]
print(f"Dropped {dropped_rows} rows with missing 'Path'.")

# 3. Duplicate Removal
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Remove duplicate rows
df = df.drop_duplicates()
print(f"Data shape after removing duplicates: {df.shape}")

# 4. Ensure columns are numeric for Z-score calculation
# Convert columns to numeric, coercing errors to NaN
for col in ['ClassId', 'ShapeId', 'ColorId', 'SignId']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Check if there are any remaining non-numeric (NaN) values after conversion
print("Missing values after conversion to numeric:\n", df[['ClassId', 'ShapeId', 'ColorId', 'SignId']].isnull().sum())

# Optionally, impute or drop rows with remaining NaN values
df.dropna(subset=['ClassId', 'ShapeId', 'ColorId', 'SignId'], inplace=True)

# 5. Outlier Detection using Z-scores
numeric_cols = ['ClassId', 'ShapeId', 'ColorId', 'SignId']

# Calculate Z-scores and filter out outliers
z_scores = df[numeric_cols].apply(zscore)
outliers = (np.abs(z_scores) > 3).any(axis=1)
num_outliers = outliers.sum()
print(f"Number of outlier rows detected: {num_outliers}")

# Remove outliers
df_cleaned = df[~outliers].copy()
print(f"Data shape after removing outliers: {df_cleaned.shape}")

# 6. Final Cleaning Check
print("Missing values after cleaning:\n", df_cleaned.isnull().sum())

# Export the cleaned dataset
df_cleaned.to_csv("cleaned_dataset_Meta.csv", index=False)
print("Data cleaning process completed. Cleaned dataset exported as 'cleaned_dataset_Meta.csv'")

Initial Data Types:
 Path       object
ClassId     int64
ShapeId     int64
ColorId     int64
SignId     object
dtype: object

Unique values in 'SignId':
 ['1.32' '3.29' '3.27' '1.22' '2.3' '2.1' '2.2' '3.1' '3.3' '3.21' '1.39'
 '1.2' '1.1' '1.3.2' '1.13' '1.5.2' '1.37' '1.24' '1.33' '1.34' nan '1.36'
 '3.42' '4.2' '4.3' '4.1' '4.4' '4.5' '4.7' '4.8' '3.26' '3.28' '3.25']
Filled missing values in 'ClassId' with median: 21.0
Filled missing values in 'ShapeId' with median: 1.0
Filled missing values in 'ColorId' with median: 0.0
Filled missing values in 'SignId' with mode: 3.29
Dropped 0 rows with missing 'Path'.
Number of duplicate rows: 0
Data shape after removing duplicates: (43, 5)
Missing values after conversion to numeric:
 ClassId    0
ShapeId    0
ColorId    0
SignId     2
dtype: int64
Number of outlier rows detected: 1
Data shape after removing outliers: (40, 5)
Missing values after cleaning:
 Path       0
ClassId    0
ShapeId    0
ColorId    0
SignId     0
dtype: int64
Data clean

In [45]:
df = pd.read_csv("Test.csv")
df.head()

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,53,54,6,5,48,49,16,Test/00000.png
1,42,45,5,5,36,40,1,Test/00001.png
2,48,52,6,6,43,47,38,Test/00002.png
3,27,29,5,5,22,24,33,Test/00003.png
4,60,57,5,5,55,52,11,Test/00004.png


In [46]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load your dataset
df = pd.read_csv("Test.csv")

# Display initial data info
print("Initial Data Information:")
print(df.info())
print(df.describe())

# 1. **Data Integrity** - Check for data types and range issues
# Convert Width, Height, ROI coordinates to numeric if needed
df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']] = df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']].apply(pd.to_numeric, errors='coerce')

# 2. **Missing Data Handling**
# Impute missing numeric data using mean or median
imputer = SimpleImputer(strategy='mean')  # Can change to 'median' if necessary
df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']] = imputer.fit_transform(df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']])

# Handle missing values in ClassId or Path (dropping if few, or filling with a default class)
df['ClassId'].fillna('Unknown', inplace=True)  # You can change this strategy if needed
df['Path'].fillna('No_Path', inplace=True)  # Replace with a valid placeholder or remove if necessary

# 3. **Duplicate Removal**
# Check for duplicate rows and remove them
df.drop_duplicates(inplace=True)

# 4. **Standardization** - Normalize or scale relevant numerical columns
scaler = StandardScaler()
df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']] = scaler.fit_transform(df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']])

# 5. **Outlier Detection**
# Identify and remove outliers using Z-score or IQR method

# Using Z-score method
from scipy import stats

# Calculate Z-scores
z_scores = np.abs(stats.zscore(df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']]))
outliers = (z_scores > 3).any(axis=1)  # Mark outliers where Z-score > 3

# Remove outliers from the dataset
df = df[~outliers]

# Final Dataset Check
print("Final Data Information:")
print(df.info())
print(df.describe())

# Save the cleaned dataset to a new CSV file
df.to_csv("cleaned_dataset_Test.csv", index=False)

Initial Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12630 entries, 0 to 12629
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Width    12630 non-null  int64 
 1   Height   12630 non-null  int64 
 2   Roi.X1   12630 non-null  int64 
 3   Roi.Y1   12630 non-null  int64 
 4   Roi.X2   12630 non-null  int64 
 5   Roi.Y2   12630 non-null  int64 
 6   ClassId  12630 non-null  int64 
 7   Path     12630 non-null  object
dtypes: int64(7), object(1)
memory usage: 789.5+ KB
None
              Width        Height        Roi.X1        Roi.Y1        Roi.X2  \
count  12630.000000  12630.000000  12630.000000  12630.000000  12630.000000   
mean      50.507759     50.364450      5.998021      5.982423     44.864450   
std       25.088483     23.698908      1.543954      1.427424     23.776102   
min       25.000000     25.000000      1.000000      5.000000     20.000000   
25%       34.000000     35.000000      5.000

In [47]:
df = pd.read_csv("Train.csv")
df.head()

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,27,26,5,5,22,20,20,Train/20/00020_00000_00000.png
1,28,27,5,6,23,22,20,Train/20/00020_00000_00001.png
2,29,26,6,5,24,21,20,Train/20/00020_00000_00002.png
3,28,27,5,6,23,22,20,Train/20/00020_00000_00003.png
4,28,26,5,5,23,21,20,Train/20/00020_00000_00004.png


In [48]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load your dataset
df = pd.read_csv("Train.csv")

# Display initial data info
print("Initial Data Information:")
print(df.info())
print(df.describe())

# 1. **Data Integrity** - Check for data types and range issues
# Convert Width, Height, ROI coordinates to numeric if needed
df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']] = df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']].apply(pd.to_numeric, errors='coerce')

# 2. **Missing Data Handling**
# Impute missing numeric data using mean or median
imputer = SimpleImputer(strategy='mean')  # Can change to 'median' if necessary
df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']] = imputer.fit_transform(df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']])

# Handle missing values in ClassId or Path (dropping if few, or filling with a default class)
df['ClassId'].fillna('Unknown', inplace=True)  # You can change this strategy if needed
df['Path'].fillna('No_Path', inplace=True)  # Replace with a valid placeholder or remove if necessary

# 3. **Duplicate Removal**
# Check for duplicate rows and remove them
df.drop_duplicates(inplace=True)

# 4. **Standardization** - Normalize or scale relevant numerical columns
scaler = StandardScaler()
df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']] = scaler.fit_transform(df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']])

# 5. **Outlier Detection**
# Identify and remove outliers using Z-score or IQR method

# Using Z-score method
from scipy import stats

# Calculate Z-scores
z_scores = np.abs(stats.zscore(df[['Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2']]))
outliers = (z_scores > 3).any(axis=1)  # Mark outliers where Z-score > 3

# Remove outliers from the dataset
df = df[~outliers]

# Final Dataset Check
print("Final Data Information:")
print(df.info())
print(df.describe())

# Save the cleaned dataset to a new CSV file
df.to_csv("cleaned_dataset_Train.csv", index=False)

Initial Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39209 entries, 0 to 39208
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Width    39209 non-null  int64 
 1   Height   39209 non-null  int64 
 2   Roi.X1   39209 non-null  int64 
 3   Roi.Y1   39209 non-null  int64 
 4   Roi.X2   39209 non-null  int64 
 5   Roi.Y2   39209 non-null  int64 
 6   ClassId  39209 non-null  int64 
 7   Path     39209 non-null  object
dtypes: int64(7), object(1)
memory usage: 2.4+ MB
None
              Width        Height        Roi.X1        Roi.Y1        Roi.X2  \
count  39209.000000  39209.000000  39209.000000  39209.000000  39209.000000   
mean      50.835880     50.328930      5.999515      5.962381     45.197302   
std       24.306933     23.115423      1.475493      1.385440     23.060157   
min       25.000000     25.000000      0.000000      5.000000     20.000000   
25%       35.000000     35.000000      5.00000

In [49]:
df = pd.read_csv("CAvideos.csv")
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10T17:00:03.000Z,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579,787425,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...
1,0dBIkQ4Mz1M,17.14.11,PLUSH - Bad Unboxing Fan Mail,iDubbbzTV,23,2017-11-13T17:00:00.000Z,"plush|""bad unboxing""|""unboxing""|""fan mail""|""id...",1014651,127794,1688,13030,https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg,False,False,False,STill got a lot of packages. Probably will las...
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146035,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095828,132239,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...
4,2Vv-BfVoq4g,17.14.11,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,10,2017-11-09T11:04:14.000Z,"edsheeran|""ed sheeran""|""acoustic""|""live""|""cove...",33523622,1634130,21082,85067,https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg,False,False,False,🎧: https://ad.gt/yt-perfect\n💰: https://atlant...


In [51]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('CAvideos.csv')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_CAvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40881 entries, 0 to 40880
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40881 non-null  object
 1   trending_date           40881 non-null  object
 2   title                   40881 non-null  object
 3   channel_title           40881 non-null  object
 4   category_id             40881 non-null  int64 
 5   publish_time            40881 non-null  object
 6   tags                    40881 non-null  object
 7   views                   40881 non-null  int64 
 8   likes                   40881 non-null  int64 
 9   dislikes                40881 non-null  int64 
 10  comment_count           40881 non-null  int64 
 11  thumbnail_link          40881 non-null  object
 12  comments_disabled       40881 non-null  bool  
 13  ratings_disabled        40881 non-null  bool  
 14  video_error_or_removed  40881 non-null  bool  
 15  de

In [52]:
df = pd.read_csv("DEvideos.csv")
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,LgVi6y5QIjM,17.14.11,Sing zu Ende! | Gesangseinlagen vom Feinsten |...,inscope21,24,2017-11-13T17:08:49.000Z,"inscope21|""sing zu ende""|""gesangseinlagen""|""ge...",252786,35885,230,1539,https://i.ytimg.com/vi/LgVi6y5QIjM/default.jpg,False,False,False,Heute gibt es mal wieder ein neues Format... w...
1,Bayt7uQith4,17.14.11,Kinder ferngesteuert im Kiosk! Erwachsene abzo...,LUKE! Die Woche und ich,23,2017-11-12T22:30:01.000Z,"Kinder|""ferngesteuert""|""Kinder ferngesteuert""|...",797196,53576,302,1278,https://i.ytimg.com/vi/Bayt7uQith4/default.jpg,False,False,False,Kinder ferngesteuert! Kinder lassen sich sooo ...
2,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97190,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
3,AHtypnRk7JE,17.14.11,Das Fermi-Paradoxon,100SekundenPhysik,27,2017-11-12T15:00:01.000Z,"Physik|""Wissenschaft""|""Technik""|""Science-Ficti...",380247,31821,458,1955,https://i.ytimg.com/vi/AHtypnRk7JE/default.jpg,False,False,False,►Alle Videos: http://bit.ly/1fa7Tw3\n\n\n✚Snap...
4,ZJ9We4bjcg0,17.14.11,18 SONGS mit Kelly MissesVlog (Sing-off),rezo,24,2017-11-12T13:10:36.000Z,"kelly|""missesvlog""|""kelly song""|""bausa""|""bausa...",822213,100684,2467,10244,https://i.ytimg.com/vi/ZJ9We4bjcg0/default.jpg,False,False,False,18 Song Mashup über den (veränderten) Beat von...


In [53]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('DEvideos.csv')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_DEvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40840 entries, 0 to 40839
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40840 non-null  object
 1   trending_date           40840 non-null  object
 2   title                   40840 non-null  object
 3   channel_title           40840 non-null  object
 4   category_id             40840 non-null  int64 
 5   publish_time            40840 non-null  object
 6   tags                    40840 non-null  object
 7   views                   40840 non-null  int64 
 8   likes                   40840 non-null  int64 
 9   dislikes                40840 non-null  int64 
 10  comment_count           40840 non-null  int64 
 11  thumbnail_link          40840 non-null  object
 12  comments_disabled       40840 non-null  bool  
 13  ratings_disabled        40840 non-null  bool  
 14  video_error_or_removed  40840 non-null  bool  
 15  de

In [54]:
df = pd.read_csv("FRvideos.csv")
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,Ro6eob0LrCY,17.14.11,Malika LePen : Femme de Gauche - Trailer,Le Raptor Dissident,24,2017-11-13T17:32:55.000Z,"Raptor""|""Dissident""|""Expliquez""|""moi""|""cette""|...",212702,29282,1108,3817,https://i.ytimg.com/vi/Ro6eob0LrCY/default.jpg,False,False,False,Dimanche.\n18h30.\nSoyez présents pour la vidé...
1,Yo84eqYwP98,17.14.11,"LA PIRE PARTIE ft Le Rire Jaune, Pierre Croce,...",Le Labo,24,2017-11-12T15:00:02.000Z,[none],432721,14053,576,1161,https://i.ytimg.com/vi/Yo84eqYwP98/default.jpg,False,False,False,Le jeu de société: https://goo.gl/hhG1Ta\n\nGa...
2,ceqntSXE-10,17.14.11,DESSINS ANIMÉS FRANÇAIS VS RUSSES 2 - Daniil...,Daniil le Russe,23,2017-11-13T17:00:38.000Z,"cartoon""|""pokémon""|""école""|""ours""|""мультфильм",482153,76203,477,9580,https://i.ytimg.com/vi/ceqntSXE-10/default.jpg,False,False,False,Une nouvelle dose de dessins animés français e...
3,WuTFI5qftCE,17.14.11,PAPY GRENIER - METAL GEAR SOLID,Joueur Du Grenier,20,2017-11-12T17:00:02.000Z,"Papy grenier""|""Metal Gear Solid""|""PS1""|""Tirage...",925222,85016,550,4303,https://i.ytimg.com/vi/WuTFI5qftCE/default.jpg,False,False,False,"Nouvel ,épisode de Papy Grenier ! Ce mois-ci o..."
4,ee6OFs8TdEg,17.14.11,QUI SAUTERA LE PLUS HAUT ? (VÉLO SKATE ROLLER ...,Aurelien Fontenoy,17,2017-11-13T16:30:03.000Z,"vélo""|""vtt""|""bmx""|""freestyle""|""bike""|""mtb""|""di...",141695,8091,72,481,https://i.ytimg.com/vi/ee6OFs8TdEg/default.jpg,False,False,False,Sauts à plus de 4 mètres de haut dans un tramp...


In [55]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('FRvideos.csv')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_FRvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40724 entries, 0 to 40723
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40724 non-null  object
 1   trending_date           40724 non-null  object
 2   title                   40724 non-null  object
 3   channel_title           40724 non-null  object
 4   category_id             40724 non-null  int64 
 5   publish_time            40724 non-null  object
 6   tags                    40724 non-null  object
 7   views                   40724 non-null  int64 
 8   likes                   40724 non-null  int64 
 9   dislikes                40724 non-null  int64 
 10  comment_count           40724 non-null  int64 
 11  thumbnail_link          40724 non-null  object
 12  comments_disabled       40724 non-null  bool  
 13  ratings_disabled        40724 non-null  bool  
 14  video_error_or_removed  40724 non-null  bool  
 15  de

In [56]:
df = pd.read_csv("GBvideos.csv")
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,Jw1Y-zhQURU,17.14.11,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,26,2017-11-10T07:38:29.000Z,"christmas|""john lewis christmas""|""john lewis""|...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False,Click here to continue the story and make your...
1,3s1rvMFUweQ,17.14.11,Taylor Swift: …Ready for It? (Live) - SNL,Saturday Night Live,24,2017-11-12T06:24:44.000Z,"SNL|""Saturday Night Live""|""SNL Season 43""|""Epi...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False,Musical guest Taylor Swift performs …Ready for...
2,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10T17:00:03.000Z,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579,787420,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...
3,PUTEiSjKwJU,17.14.11,Goals from Salford City vs Class of 92 and Fri...,Salford City Football Club,17,2017-11-13T02:30:38.000Z,"Salford City FC|""Salford City""|""Salford""|""Clas...",27833,193,12,37,https://i.ytimg.com/vi/PUTEiSjKwJU/default.jpg,False,False,False,Salford drew 4-4 against the Class of 92 and F...
4,rHwDegptbI4,17.14.11,Dashcam captures truck's near miss with child ...,Cute Girl Videos,25,2017-11-13T01:45:13.000Z,[none],9815,30,2,30,https://i.ytimg.com/vi/rHwDegptbI4/default.jpg,False,False,False,Dashcam captures truck's near miss with child ...


In [57]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('GBvideos.csv')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_GBvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38916 entries, 0 to 38915
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                38916 non-null  object
 1   trending_date           38916 non-null  object
 2   title                   38916 non-null  object
 3   channel_title           38916 non-null  object
 4   category_id             38916 non-null  int64 
 5   publish_time            38916 non-null  object
 6   tags                    38916 non-null  object
 7   views                   38916 non-null  int64 
 8   likes                   38916 non-null  int64 
 9   dislikes                38916 non-null  int64 
 10  comment_count           38916 non-null  int64 
 11  thumbnail_link          38916 non-null  object
 12  comments_disabled       38916 non-null  bool  
 13  ratings_disabled        38916 non-null  bool  
 14  video_error_or_removed  38916 non-null  bool  
 15  de

In [58]:
df = pd.read_csv("INvideos.csv")
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,kzwfHumJyYc,17.14.11,Sharry Mann: Cute Munda ( Song Teaser) | Parmi...,Lokdhun Punjabi,1,2017-11-12T12:20:39.000Z,"sharry mann|""sharry mann new song""|""sharry man...",1096327,33966,798,882,https://i.ytimg.com/vi/kzwfHumJyYc/default.jpg,False,False,False,Presenting Sharry Mann latest Punjabi Song Cu...
1,zUZ1z7FwLc8,17.14.11,"पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...",HJ NEWS,25,2017-11-13T05:43:56.000Z,"पीरियड्स के समय|""पेट पर पति करता ऐसा""|""देखकर द...",590101,735,904,0,https://i.ytimg.com/vi/zUZ1z7FwLc8/default.jpg,True,False,False,"पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं..."
2,10L1hZ9qa58,17.14.11,Stylish Star Allu Arjun @ ChaySam Wedding Rece...,TFPC,24,2017-11-12T15:48:08.000Z,Stylish Star Allu Arjun @ ChaySam Wedding Rece...,473988,2011,243,149,https://i.ytimg.com/vi/10L1hZ9qa58/default.jpg,False,False,False,Watch Stylish Star Allu Arjun @ ChaySam Weddin...
3,N1vE8iiEg64,17.14.11,Eruma Saani | Tamil vs English,Eruma Saani,23,2017-11-12T07:08:48.000Z,"Eruma Saani|""Tamil Comedy Videos""|""Films""|""Mov...",1242680,70353,1624,2684,https://i.ytimg.com/vi/N1vE8iiEg64/default.jpg,False,False,False,This video showcases the difference between pe...
4,kJzGH0PVQHQ,17.14.11,why Samantha became EMOTIONAL @ Samantha naga ...,Filmylooks,24,2017-11-13T01:14:16.000Z,"Filmylooks|""latest news""|""telugu movies""|""telu...",464015,492,293,66,https://i.ytimg.com/vi/kJzGH0PVQHQ/default.jpg,False,False,False,why Samantha became EMOTIONAL @ Samantha naga ...


In [59]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('INvideos.csv')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_INvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37352 entries, 0 to 37351
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                37352 non-null  object
 1   trending_date           37352 non-null  object
 2   title                   37352 non-null  object
 3   channel_title           37352 non-null  object
 4   category_id             37352 non-null  int64 
 5   publish_time            37352 non-null  object
 6   tags                    37352 non-null  object
 7   views                   37352 non-null  int64 
 8   likes                   37352 non-null  int64 
 9   dislikes                37352 non-null  int64 
 10  comment_count           37352 non-null  int64 
 11  thumbnail_link          37352 non-null  object
 12  comments_disabled       37352 non-null  bool  
 13  ratings_disabled        37352 non-null  bool  
 14  video_error_or_removed  37352 non-null  bool  
 15  de

In [65]:
df = pd.read_csv('JPvideos.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,5ugKfHgsmYw,18.07.02,é¸èªããªãåç´ã«è½ä¸ï¼è·¯ä¸ã®è»ã...,æäºéä¿¡æ åã»ã³ã¿ã¼,25,2018-02-06T03:04:37.000Z,"äºæ|""ä½è³""|""ä½è³ç""|""ããªã³ãã¿ã...",188085,591,189,0,https://i.ytimg.com/vi/5ugKfHgsmYw/default.jpg,True,False,False,ä½è³çç¥å¼å¸ã®æ°å®¶ã«å¢è½ããé¸ä...
1,ohObafdd34Y,18.07.02,ã¤ããQ ãç¥­ãç·å®®å·Ãæè¶ å·¨å¤§ã...,ç¥è°·ãããª Kamiya Erina 2,1,2018-02-06T04:01:56.000Z,[none],90929,442,88,174,https://i.ytimg.com/vi/ohObafdd34Y/default.jpg,False,False,False,
2,aBr2kKAHN6M,18.07.02,Live Views of Starman,SpaceX,28,2018-02-06T21:38:22.000Z,[none],6408303,165892,2331,3006,https://i.ytimg.com/vi/aBr2kKAHN6M/default.jpg,False,False,False,
3,5wNnwChvmsQ,18.07.02,æ±äº¬ãã£ãºãã¼ãªã¾ã¼ãã®åã­ã£ã...,ã¢ã·ã¿ãã¯ãã¤,25,2018-02-06T06:08:49.000Z,ã¢ã·ã¿ãã¯ãã¤,96255,1165,277,545,https://i.ytimg.com/vi/5wNnwChvmsQ/default.jpg,False,False,False,æ±äº¬ãã£ãºãã¼ãªã¾ã¼ãã®åã­ã£ã...
4,B7J47qFvdsk,18.07.02,æ¦®åå¥ããè¡æã®æ­»ãã ãµãï¼æ ç...,ã·ãããã¥ãã¤,1,2018-02-06T02:30:00.000Z,[none],108408,1336,74,201,https://i.ytimg.com/vi/B7J47qFvdsk/default.jpg,False,False,False,å®¶ã«å¸°ã£ã¦ãããµã©ãªã¼ãã³ã®ãã...


In [67]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('JPvideos.csv', encoding='ISO-8859-1')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_JPvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20523 entries, 0 to 20522
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                20523 non-null  object
 1   trending_date           20523 non-null  object
 2   title                   20523 non-null  object
 3   channel_title           20523 non-null  object
 4   category_id             20523 non-null  int64 
 5   publish_time            20523 non-null  object
 6   tags                    20523 non-null  object
 7   views                   20523 non-null  int64 
 8   likes                   20523 non-null  int64 
 9   dislikes                20523 non-null  int64 
 10  comment_count           20523 non-null  int64 
 11  thumbnail_link          20523 non-null  object
 12  comments_disabled       20523 non-null  bool  
 13  ratings_disabled        20523 non-null  bool  
 14  video_error_or_removed  20523 non-null  bool  
 15  de

In [69]:
df = pd.read_csv('KRvideos.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,RxGQe4EeEpA,17.14.11,ì¢ì by ë¯¼ì_ì¤ì¢ì _ì¢ë ëµê°,ë¼í¸ë§ì½ë¦¬ì,22,2017-11-13T07:07:36.000Z,"ë¼í¸ë§|""ì¤ì¢ì ""|""ì¢ë""|""ì¢ì""|""ì¬ë ...",156130,1422,40,272,https://i.ytimg.com/vi/RxGQe4EeEpA/default.jpg,False,False,False,ì¤ì¢ì 'ì¢ë'ì ëµê° 'ì¢ì' ìµì´ ê...
1,hH7wVE8OlQ0,17.14.11,JSA ê·ì ë¶íêµ° ì´ê²© ë¶ì,Edward,25,2017-11-13T10:59:16.000Z,"JSA|""ê·ì""|""ë¶íêµ°""|""ì´ê²©""|""ë¶ì""|""JS...",76533,211,28,113,https://i.ytimg.com/vi/hH7wVE8OlQ0/default.jpg,False,False,False,[ì±ëAë¨ë]å ë³ì¬ íì¬ 'ììë¶ëª...
2,9V8bnWUmE9U,17.14.11,ëëª°ë¼í¨ë°ë¦¬ ì´ëí ìì 2í (ë¹¼ë...,ëëª°ë¼í¨ë°ë¦¬ í«ì¼,22,2017-11-11T07:16:08.000Z,"ìëë¤ì¤|""ë¹¼ë¹¼ë¡""|""í«ì¼""|""ëëª°ë¼í...",421409,5112,166,459,https://i.ytimg.com/vi/9V8bnWUmE9U/default.jpg,False,False,False,í¼ê°ì¤ë ê¼­ ì¶ì² ë¶íëë ¤ì
3,0_8py-t5R80,17.14.11,"ááµáá§á¼áá¡á¨ ì¶êµ­ íì¥, ëì¹...",ë¯¸ëì´ëª½êµ¬,25,2017-11-12T11:19:52.000Z,"ì´ëªë°|""ì´ëªë° ì¶êµ­ê¸ì§""|""ì´ëªë° ...",222850,2093,173,1219,https://i.ytimg.com/vi/0_8py-t5R80/default.jpg,False,False,False,ë¤ì¤ë ëêµ¬ê²ëê¹ ë£ê³ ë í íì 
4,bk55RbxiQdI,17.14.11,ê¹ì¥ê²¸ì ë¬¼ë¬ê°ë¤ MBC ë¸ì¡° íí¸ì...,NocutV,25,2017-11-13T11:08:59.000Z,"nocutV|""ë¸ì»·V""|""CBS""|""mbc""|""ê¹ì¥ê²¸""|""í´ì...",84466,1094,109,450,https://i.ytimg.com/vi/bk55RbxiQdI/default.jpg,False,False,False,ê¹ì¥ê²¸ MBC ì¬ì¥ì´ ê²°êµ­ í´ìëë¤.\n...


In [70]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('KRvideos.csv', encoding='ISO-8859-1')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_KRvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34567 entries, 0 to 34566
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                34567 non-null  object
 1   trending_date           34567 non-null  object
 2   title                   34567 non-null  object
 3   channel_title           34567 non-null  object
 4   category_id             34567 non-null  int64 
 5   publish_time            34567 non-null  object
 6   tags                    34567 non-null  object
 7   views                   34567 non-null  int64 
 8   likes                   34567 non-null  int64 
 9   dislikes                34567 non-null  int64 
 10  comment_count           34567 non-null  int64 
 11  thumbnail_link          34567 non-null  object
 12  comments_disabled       34567 non-null  bool  
 13  ratings_disabled        34567 non-null  bool  
 14  video_error_or_removed  34567 non-null  bool  
 15  de

In [72]:
df = pd.read_csv('MXvideos.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,SbOwzAl9ZfQ,17.14.11,CapÃ­tulo 12 | MasterChef 2017,MasterChef 2017,24,2017-11-13T06:06:22.000Z,"MasterChef Junior 2017|""TV Azteca""|""recetas""|""...",310130,4182,361,1836,https://i.ytimg.com/vi/SbOwzAl9ZfQ/default.jpg,False,False,False,Disfruta la presencia del Chef Torreblanca en ...
1,klOV6Xh-DnI,17.14.11,ALEXA EX-INTEGRANTE DEL GRUPO TIMBIRICHE RENUN...,Micky Contreras Martinez,22,2017-11-13T05:11:58.000Z,La Voz Mexico 7,104972,271,174,369,https://i.ytimg.com/vi/klOV6Xh-DnI/default.jpg,False,False,False,ALEXA EX-INTEGRANTE DEL GRUPO TIMBIRICHE RENUN...
2,6L2ZF7Qzsbk,17.14.11,LOUIS CKAGÃ - EL PULSO DE LA REPÃBLICA,El Pulso De La RepÃºblica,25,2017-11-13T17:00:02.000Z,"Chumel Torres|""El Pulso de la Republica""|""noti...",136064,10105,266,607,https://i.ytimg.com/vi/6L2ZF7Qzsbk/default.jpg,False,False,False,La canciÃ³n del principio se llama âEste esp...
3,hcY52MFWMDM,17.14.11,Sismo de 6.7 sacude Costa Rica 12 Noviembre 2017,Casanare,25,2017-11-13T03:47:10.000Z,"temblor|""costa rica""|""sismo en costa rica""",96153,378,171,208,https://i.ytimg.com/vi/hcY52MFWMDM/default.jpg,False,False,False,El video es de un Walmart en el pais centroame...
4,_OXDcGPVAa4,17.14.11,DOG HACKS | MUSAS LESSLIE LOS POLINESIOS,Musas,26,2017-11-13T19:17:48.000Z,"MUSAS|""lesslie""|""karen""|""hacks""|""perros""|""dogs...",499965,57781,681,7428,https://i.ytimg.com/vi/_OXDcGPVAa4/default.jpg,False,False,False,MI HERMANO NARRA MI RUTINA DE MAQUILLAJE\nhttp...


In [73]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('MXvideos.csv', encoding='ISO-8859-1')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_MXvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40451 entries, 0 to 40450
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40451 non-null  object
 1   trending_date           40451 non-null  object
 2   title                   40451 non-null  object
 3   channel_title           40451 non-null  object
 4   category_id             40451 non-null  int64 
 5   publish_time            40451 non-null  object
 6   tags                    40451 non-null  object
 7   views                   40451 non-null  int64 
 8   likes                   40451 non-null  int64 
 9   dislikes                40451 non-null  int64 
 10  comment_count           40451 non-null  int64 
 11  thumbnail_link          40451 non-null  object
 12  comments_disabled       40451 non-null  bool  
 13  ratings_disabled        40451 non-null  bool  
 14  video_error_or_removed  40451 non-null  bool  
 15  de

In [74]:
df = pd.read_csv('RUvideos.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,gDuslQ9avLc,17.14.11,ÐÐ°ÑÐ°Ñ Ð¸ ÐÐ¾Ð»Ð¸Ð½Ð° ÑÑÐ°ÑÑÑ ÑÐºÐ¾...,Ð¢âÐ ÐÐÐÐÐ§,22,2017-11-13T09:09:31.000Z,"Ð·Ð°ÑÐ°Ñ Ð¸ Ð¿Ð¾Ð»Ð¸Ð½Ð°|""ÑÑÐ¸Ð¼ÑÑ ÑÐºÐ...",62408,334,190,50,https://i.ytimg.com/vi/gDuslQ9avLc/default.jpg,False,False,False,"ÐÐ½Ð°ÐºÐ¾Ð¼ÑÑÐµÑÑ, ÑÑÐ¾ ÐÐ°ÑÐ°Ñ Ð¸ Ð..."
1,AOCJIFEA_jE,17.14.11,ÐÐ¸ÑÐ¶Ð° ÐÐµÐ¼Ð¾Ð² #29. ÐÐ¾Ð»ÑÑÐ¾Ð¸Ì Ð²...,Druzhko Show,22,2017-11-13T17:32:11.000Z,"Ð±Ð¸ÑÐ¶Ð° Ð¼ÐµÐ¼Ð¾Ð²|""Ð»ÐµÐ² ÑÐ°Ð³Ð¸Ð½ÑÐ½""|...",330043,43841,2244,2977,https://i.ytimg.com/vi/AOCJIFEA_jE/default.jpg,False,False,False,Ð 29 Ð²ÑÐ¿ÑÑÐºÐµ ÐÑÑÐ¶ÐºÐ¾ Ð¨Ð¾Ñ Ð¡ÐµÑ...
2,VAWNQDgwwOM,17.14.11,Ð¥ÐÐÐ ÐÐ­ÐÐ - Ð¡ÐÐÐ Ð¡Ð ÐÐÐ Ð§Ð£Ð...,Ð®Ð»Ð¸Ðº,24,2017-11-13T16:11:31.000Z,"ÑÐ¼Ð¾Ñ|""ÐºÐ¾Ð¼ÐµÐ´Ð¸Ñ""|""Ð²Ð»Ð¾Ð³""|""Ð±Ð»Ð¾Ð³...",424596,49854,714,2944,https://i.ytimg.com/vi/VAWNQDgwwOM/default.jpg,False,False,False,http://kapitany.ru/!Yulik.cap - Ð¤Ð°ÐºÑÐ»ÑÑ...
3,gknkFwKQfHg,17.14.11,Ð¡Ð¾ÑÐ½Ð°Ñ ÐºÐµÑÐ°Ð´Ð¸Ð»ÑÑ Ñ ÐºÑÑÐ¸ÑÐµÐ¹,Hochland,22,2017-11-13T06:51:10.000Z,"ÑÐ¾ÑÐ»Ð°Ð½Ð´|""ÑÑÑ""|""ÑÐµÑÐµÐ¿ÑÑ""|""ÐºÐ°...",112851,3566,122,80,https://i.ytimg.com/vi/gknkFwKQfHg/default.jpg,False,False,False,*** ÐºÐ°Ðº Ð³Ð¾ÑÐ¾Ð²Ð¸ÑÑ ÑÐµÑÑÐ¾ÑÐ°Ð½Ð½...
4,3sYvZcwzZr8,17.14.11,ÐÐÐÐÐ« Ð ÐÐÐÐ¢ÐÐÐÐ ÐÐ Ð¨ÐÐÐÐ¬Ð...,Ð¡Ð¾Ð²ÐµÑÐ³Ð¾Ð½,24,2017-11-13T16:52:36.000Z,"Ð¡Ð¾Ð²ÐµÑÐ³Ð¾Ð½|""Sovergon""|""ÐºÐ»Ð¸Ð¿Ñ""|""ÑÐ¾...",243469,36216,631,1692,https://i.ytimg.com/vi/3sYvZcwzZr8/default.jpg,False,False,False,â ÐÐÐÐÐ£Ð Ð¡ ÐÐÐÐÐ¢ÐÐÐ«: http://ka...


In [75]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('RUvideos.csv', encoding='ISO-8859-1')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_RUvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40739 entries, 0 to 40738
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40739 non-null  object
 1   trending_date           40739 non-null  object
 2   title                   40739 non-null  object
 3   channel_title           40739 non-null  object
 4   category_id             40739 non-null  int64 
 5   publish_time            40739 non-null  object
 6   tags                    40739 non-null  object
 7   views                   40739 non-null  int64 
 8   likes                   40739 non-null  int64 
 9   dislikes                40739 non-null  int64 
 10  comment_count           40739 non-null  int64 
 11  thumbnail_link          40739 non-null  object
 12  comments_disabled       40739 non-null  bool  
 13  ratings_disabled        40739 non-null  bool  
 14  video_error_or_removed  40739 non-null  bool  
 15  de

In [76]:
df = pd.read_csv('USvideos.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO â¶ \n\nSUBSCRIBE âº ...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


In [77]:
import pandas as pd

# Load the dataset (assume it's a CSV for now)
df = pd.read_csv('USvideos.csv', encoding='ISO-8859-1')

### 1. Data Integrity Check ###

# Check the basic structure of the dataset
print(df.info())  # Check for missing values and data types
print(df.describe())  # Get descriptive statistics for numerical columns

# Check for unique values in categorical fields
print(df['category_id'].unique())  # Unique categories
print(df['comments_disabled'].unique())  # Ensure comments_disabled is either True/False

### 2. Handling Missing Data ###

# Check for missing data in each column
print(df.isnull().sum())

# Fill missing values (Example: fill missing 'description' with 'No description')
df['description'].fillna('No description', inplace=True)

# Alternatively, drop rows with missing data in important columns like 'views', 'likes'
df.dropna(subset=['views', 'likes', 'dislikes'], inplace=True)

### 3. Duplicate Removal ###

# Check for duplicates
duplicates = df.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# Remove duplicates
df.drop_duplicates(inplace=True)

### 4. Standardization ###

# Standardize 'publish_time' and 'trending_date' to datetime format
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Standardize text columns by trimming spaces and lowercasing
df['title'] = df['title'].str.strip().str.lower()
df['channel_title'] = df['channel_title'].str.strip().str.lower()

### 5. Outlier Detection ###

# Check for outliers in numerical columns such as 'views', 'likes', 'dislikes'
import numpy as np

# Calculate the Z-score to identify outliers
from scipy import stats
df['views_zscore'] = np.abs(stats.zscore(df['views']))
df['likes_zscore'] = np.abs(stats.zscore(df['likes']))

# Set a threshold for outliers (e.g., Z-score > 3)
outliers_views = df[df['views_zscore'] > 3]
outliers_likes = df[df['likes_zscore'] > 3]

print(f'Outliers in Views: {outliers_views.shape[0]}')
print(f'Outliers in Likes: {outliers_likes.shape[0]}')

# Optionally remove outliers
df = df[(df['views_zscore'] <= 3) & (df['likes_zscore'] <= 3)]

# Drop the Z-score columns as they are no longer needed
df.drop(columns=['views_zscore', 'likes_zscore'], inplace=True)

### Final Validation ###

# After cleaning, recheck the data
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('cleaned_dataset_USvideos.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40949 non-null  object
 1   trending_date           40949 non-null  object
 2   title                   40949 non-null  object
 3   channel_title           40949 non-null  object
 4   category_id             40949 non-null  int64 
 5   publish_time            40949 non-null  object
 6   tags                    40949 non-null  object
 7   views                   40949 non-null  int64 
 8   likes                   40949 non-null  int64 
 9   dislikes                40949 non-null  int64 
 10  comment_count           40949 non-null  int64 
 11  thumbnail_link          40949 non-null  object
 12  comments_disabled       40949 non-null  bool  
 13  ratings_disabled        40949 non-null  bool  
 14  video_error_or_removed  40949 non-null  bool  
 15  de