First, let's import the Python libraries we'll be using:

In [1]:
import pandas as pd
import pandasql as ps
import numpy

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Next, we need to read in the first CSV file. We'll call the DataFrame `df_oldstats`, since this file contains the data on 2023's trending YouTubers:

In [2]:
#load the 2023 YouTube statistics dump into a dataframe
df_oldstats = pd.read_csv(filepath_or_buffer='old_datadump/GlobalYTstats.csv')

#preview the first five rows to check that the file parsed as expected
df_oldstats.head(n=5)

Unnamed: 0,rank,Youtuber,subscribers,video views,Category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
1,2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288


In the next few steps, I'll show exactly how many rows and columns this first dataset contains, and which of them we can drop:

In [3]:
#shape is (number of rows, number of columns)
rows, cols = df_oldstats.shape
print("Number of columns before cleaning data: ", cols)
print("Number of rows before cleaning data: ", rows)

#list every column name so we can decide which ones to keep
df_oldstats.columns.values

Number of columns before cleaning data:  28
Number of rows before cleaning data:  995


array(['rank', 'Youtuber', 'subscribers', 'video views', 'Category',
       'Title', 'uploads', 'Country', 'Abbreviation', 'channel_type',
       'video_views_rank', 'country_rank', 'channel_type_rank',
       'video_views_for_the_last_30_days', 'lowest_monthly_earnings',
       'highest_monthly_earnings', 'lowest_yearly_earnings',
       'highest_yearly_earnings', 'subscribers_for_last_30_days',
       'created_year', 'created_month', 'created_date',
       'Gross tertiary education enrollment (%)', 'Population',
       'Unemployment rate', 'Urban_population', 'Latitude', 'Longitude'],
      dtype=object)

In [4]:
#how many creators we want in the final ranking
TOP_N = 50

#the only columns the rest of the analysis uses
keep_cols = ['rank', 'Youtuber', 'subscribers', 'video views', 'Category', 'Country', 'channel_type']

#narrow down to only necessary columns (selecting by name keeps this cell re-runnable)
df_oldstats = df_oldstats[keep_cols]

#remove rows with zero video views since our focus is on singular content creators, not a broad category
#(rows with missing values in other columns are kept here and handled in a later step)
df_oldstats = df_oldstats[df_oldstats['video views'] != 0]

#keep the first TOP_N remaining rows instead of hard-coding row positions, so the cut does not
#depend on the file's total length or on how many zero-view rows sit near the top
#(assumes the csv is already ordered by rank, as the preview above suggests)
#then reset index and drop old one
df_oldstats = df_oldstats.head(TOP_N).reset_index(drop=True)

#displaying cleaned up dataset along with narrowed down cols and rows
rows, cols = df_oldstats.shape
print("Number of columns after cleaning data: ", cols)
print("Number of rows after cleaning data: ", rows)

#set the dataframe options to suppress scientific notation by setting float_format and then display ALL cols and rows
#note: these are global display settings and affect every later output in the notebook
pd.options.display.float_format = '{:.0f}'.format
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df_oldstats

Number of columns after cleaning data:  7
Number of rows after cleaning data:  50


Unnamed: 0,rank,Youtuber,subscribers,video views,Category,Country,channel_type
0,1,T-Series,245000000,228000000000,Music,India,Music
1,3,MrBeast,166000000,28368841870,Entertainment,United States,Entertainment
2,4,Cocomelon - Nursery Rhymes,162000000,164000000000,Education,United States,Education
3,5,SET India,159000000,148000000000,Shows,India,Entertainment
4,7,Kids Diana Show,112000000,93247040539,People & Blogs,United States,Entertainment
5,8,PewDiePie,111000000,29058044447,Gaming,Japan,Entertainment
6,9,Like Nastya,106000000,90479060027,People & Blogs,Russia,People
7,10,Vlad and Niki,98900000,77180169894,Entertainment,United States,Entertainment
8,11,Zee Music Company,96700000,57856289381,Music,India,Music
9,12,WWE,96000000,77428473662,Sports,United States,Sports


In [5]:
#boolean mask: True for every row that has at least one missing value in any column
null_q = df_oldstats.isna().any(axis='columns')

#show all rows flagged by the mask so the gaps can be researched and filled
null_rows = df_oldstats.loc[null_q]
null_rows

Unnamed: 0,rank,Youtuber,subscribers,video views,Category,Country,channel_type
11,15,Goldmines,86900000,24118230580,Film & Animation,,Music
34,39,LooLoo Kids - Nursery Rhymes and Children's Songs,54000000,32312431239,Music,,
44,49,Badabun,46800000,19398045702,Entertainment,,Music
47,52,Bad Bunny,46100000,30686342319,Music,,Sports


In [6]:
#researched and filled null value errors
#keys are (row label, column name) in the re-indexed dataframe shown in the previous output
manual_fixes = {
    (11, 'Country'): 'India',
    (34, 'Country'): 'United States',
    (34, 'channel_type'): 'Education',
    (44, 'Country'): 'Mexico',
    (47, 'Country'): 'Puerto Rico',
}
for (row_label, column), value in manual_fixes.items():
    df_oldstats.loc[row_label, column] = value

#now let's see if any null values still exist (False means the dataframe is complete)
df_oldstats.isna().to_numpy().any()

False

In [7]:
#the original ranks have gaps after the rows we removed, so re-rank the remaining
#youtubers consecutively based on their old rank; the new column is appended last
df_oldstats = df_oldstats.assign(**{'New Rank': df_oldstats['rank'].rank()})

df_oldstats.head()

Unnamed: 0,rank,Youtuber,subscribers,video views,Category,Country,channel_type,New Rank
0,1,T-Series,245000000,228000000000,Music,India,Music,1
1,3,MrBeast,166000000,28368841870,Entertainment,United States,Entertainment,2
2,4,Cocomelon - Nursery Rhymes,162000000,164000000000,Education,United States,Education,3
3,5,SET India,159000000,148000000000,Shows,India,Entertainment,4
4,7,Kids Diana Show,112000000,93247040539,People & Blogs,United States,Entertainment,5


In [8]:
#drop old rank column and move the new rank column to the front
#columns are selected by name rather than by position, so this cell does not depend on the
#exact number or order of columns and can be re-run safely after 'rank' is already gone
ordered_cols = ['New Rank'] + [col for col in df_oldstats.columns if col not in ('rank', 'New Rank')]
df_oldstats = df_oldstats[ordered_cols]
df_oldstats.head(5)

Unnamed: 0,New Rank,Youtuber,subscribers,video views,Category,Country,channel_type
0,1,T-Series,245000000,228000000000,Music,India,Music
1,2,MrBeast,166000000,28368841870,Entertainment,United States,Entertainment
2,3,Cocomelon - Nursery Rhymes,162000000,164000000000,Education,United States,Education
3,4,SET India,159000000,148000000000,Shows,India,Entertainment
4,5,Kids Diana Show,112000000,93247040539,People & Blogs,United States,Entertainment
