From https://kgptalkie.com/pandas-crash-course/

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path as path
import json

In [None]:
# Read the notebook settings from variables.json (resolved relative to the
# current working directory). The encoding is passed explicitly so parsing
# does not depend on the platform's default locale encoding.
with open('./variables.json', encoding='utf-8') as f:
    variables_dict = json.load(f)

# Location of the dataset folder; a missing key raises KeyError here.
dataset_path = variables_dict['dataset_path']

In [None]:
# Join each CSV file name onto the configured dataset folder.
nba_dataset_directory = path(dataset_path) / 'nba.csv'
imdb_movie_dataset_directory = path(dataset_path) / 'IMDB-Movie-Data.csv'

In [None]:
# Load the NBA CSV into a dataframe and preview its first 10 rows.
df = pd.read_csv(nba_dataset_directory)
df.head(10)

In [None]:
# Show the last 2 rows of the dataframe.
df.tail(2)

In [None]:
# Reload the NBA CSV, this time using the 'Name' column as the row index.
df = pd.read_csv(nba_dataset_directory, index_col='Name')
df.head()

In [None]:
# Rebind df to the IMDB movie CSV, indexed by its 'Rank' column.
df = pd.read_csv(imdb_movie_dataset_directory, index_col='Rank')
df.head()

In [None]:
# Show the last rows (tail() returns 5 rows when no count is given).
df.tail()

In [None]:
# Print a concise summary: index, column names, non-null counts and dtypes.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

Checking for duplicates:

In [None]:
# duplicated() marks every row that repeats an earlier row with True.
print(df.duplicated(), '\n')
# Count the duplicates with the vectorised Series.sum() instead of the
# built-in sum(), which would iterate over the Series element by element.
print('Sum is: ', df.duplicated().sum())

In [None]:
# Stack df on top of itself so that every row appears twice.
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat() is the supported equivalent and keeps the original index labels.
df1 = pd.concat([df, df])
df1.shape

The data is now duplicated, so `duplicated()` flags the repeated rows:

In [None]:
# Boolean Series: True for each row that repeats an earlier row.
df1.duplicated()

In [None]:
# df1 holds two copies of df, so it has twice as many rows.
df1.shape

In [None]:
# Number of duplicated rows, counted with the vectorised Series.sum()
# rather than the element-by-element built-in sum().
df1.duplicated().sum()

In [None]:
# Build a de-duplicated copy; df1 itself is not modified by this call.
df2 = df1.drop_duplicates()
# Shape of the de-duplicated copy.
df2.shape

In [None]:
# df1 still contains the duplicates: drop_duplicates() returned a new dataframe.
df1.shape

We can drop duplicates in place, so we do not need to create a new dataframe

In [None]:
# Remove the duplicated rows from df1 itself (no new dataframe is created),
# then show the resulting shape.
df1.drop_duplicates(inplace=True)
df1.shape

In [None]:
# Show the column labels of the dataframe (a pandas Index).
df.columns

In [None]:
# Number of columns.
df.columns.size

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Keep a reference to the current column labels so they can be restored later.
col = df.columns

In [None]:
# Display the saved column labels.
col

In [None]:
# Temporarily relabel the columns with the single letters 'a' through 'k'.
col1 = list('abcdefghijk')
df.columns = col1
df.head()

In [None]:
# Restore the saved column labels; head(0) shows only the header, no rows.
df.columns = col
df.head(0)

In [None]:
# Shorten two column names in place; columns not listed in the mapping
# keep their current names.
df.rename(columns={
    'Runtime (Minutes)': 'Minutes',
    'Revenue (Millions)': 'Revenue'
}, inplace=True)
df.columns

In [None]:
# Show the labels saved in `col` to compare them with df.columns after the rename.
col

In [None]:
# Boolean mask of the dataframe (True where a cell is missing), first 5 rows.
df.isnull().head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Alternative spelling: isna() and isnull() are aliases, so the counts match.
df.isna().sum()

In [None]:
# Copy of df without any row that contains a missing value; df is untouched.
df1 = df.dropna()
df1.shape

We can use df.dropna(inplace=True) if we want to drop NA values directly in the dataframe instead of creating a copy

In [None]:
# Column labels before dropping columns with missing values.
df.columns

In [None]:
# axis=1 drops columns (instead of rows) that contain any missing value.
df2 = df.dropna(axis=1)
df2.head(3)

The Revenue and Metascore columns were dropped because they contain missing values.

Imputation: A process of replacing missing data with substitute values

In [None]:
# Replace every missing value with 0 in a new dataframe, then verify that
# no missing values remain in it.
df3 = df.fillna(0)
df3.isna().sum()

In [None]:
# df still has its missing values: fillna(0) above returned a new dataframe.
df.isnull().sum()

In [None]:
# Select the Revenue column and show the type of the selected object.
revenue = df.Revenue
type(revenue)

In [None]:
# Last 5 revenue values, before imputation.
revenue.tail()

In [None]:
# Mean revenue; mean() skips missing values by default.
revenue_mean = revenue.mean()
revenue_mean

In [None]:
# Replace the missing revenues with the column mean. The result is rebound
# rather than filled with inplace=True: `revenue` was taken from df, and
# whether an in-place change also reaches df depends on the pandas version
# and its Copy-on-Write mode.
revenue = revenue.fillna(revenue_mean)
revenue.tail()

In [None]:
# Number of missing values left in the revenue Series.
revenue.isnull().sum()

In [None]:
# Write the revenue Series back into the dataframe and re-check the
# per-column missing-value counts.
df.Revenue = revenue
df.isnull().sum()

In [None]:
# Impute the missing Metascore values with the column mean.
metascore_mean = df['Metascore'].mean()
print('metascore_mean = ', metascore_mean)
# fillna() returns a new Series; assigning it back through df[...] avoids
# mutating a Series taken from df in place, whose effect on df depends on
# the pandas version and its Copy-on-Write mode.
metascore = df['Metascore'].fillna(metascore_mean)
df['Metascore'] = metascore
df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns after imputation.
df.describe()

In [None]:
# Re-print the dataframe summary to check the non-null counts after imputation.
df.info()

In [None]:
# Summarise the Genre column.
df.Genre.describe()

In [None]:
# The 10 most frequent Genre values with their counts
# (value_counts() sorts by descending count).
df.Genre.value_counts().head(10)

In [None]:
# Number of distinct Genre values.
df.Genre.unique().size

The `corr` method creates a correlation matrix (stored here in `corrmat`).

In [None]:
# Pairwise correlation of the numeric columns. numeric_only=True is needed
# on pandas >= 2.0, where corr() no longer silently skips non-numeric
# columns and raises on them instead.
corrmat = df.corr(numeric_only=True)
corrmat

In [None]:
# Draw the correlation matrix as a colour-coded heatmap.
sns.heatmap(corrmat)

In [None]:
# Scatter plot with Rating on the x-axis and Revenue on the y-axis.
df.plot(kind = 'scatter', x = 'Rating', y = 'Revenue', title = 'Revenue vs Rating')

In [None]:
# Histogram of the Rating column.
df.Rating.plot(kind = 'hist', title = 'Rating')

In [None]:
# Kernel density estimate (smoothed distribution curve) of the Rating column.
df.Rating.plot(kind = 'kde', title = 'Rating')

In [None]:
# How many rows have each distinct Rating value.
df.Rating.value_counts()

In [None]:
# Box plot of the Rating column.
df.Rating.plot(kind = 'box')

In [None]:
# Summary statistics of the Rating column.
df.Rating.describe()

In [None]:
# Label each row 'Good' when its rating is above 6.2 and 'Bad' otherwise
# (a missing rating compares False, so it is labelled 'Bad').
# np.where evaluates the whole column at once instead of appending to a list
# in a Python loop; tolist() keeps rating_cat a plain list of str.
rating_cat = np.where(df.Rating > 6.2, 'Good', 'Bad').tolist()
rating_cat[:20]

In [None]:
# Attach the labels as a new 'Rating Category' column and preview the result.
df['Rating Category'] = rating_cat
df.head(5)

In [None]:
# One Revenue box plot per 'Rating Category' group.
df.boxplot(column='Revenue', by='Rating Category')