# Data Analysis with PANDAS

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Build a small demo DataFrame from a dict: each key becomes a column and
# each list supplies that column's values. No index is given, so pandas
# assigns the default 0..3 row labels.
df = pd.DataFrame(
    {
        "Anime": ["One Piece", "Naruto", "Bleach", "Gintama"],
        "Episodes": [1014, 720, 366, 366],
    }
)

df

Unnamed: 0,Anime,Episodes
0,One Piece,1014
1,Naruto,720
2,Bleach,366
3,Gintama,366


In [6]:
# ANSI escape sequences: BOLD switches the terminal to bold text,
# RESET restores the default formatting afterwards.
BOLD, RESET = "\033[1m", "\033[0m"

In [7]:
# Report the Python type of `df`, wrapping the type name in the ANSI
# bold/reset codes so it stands out in the printed line.
df_type_message = f"\t'df' is of the type : {BOLD}{type(df)}{RESET}"
print(df_type_message)

	'df' is of the type : [1m<class 'pandas.core.frame.DataFrame'>[0m


### Adding a New Column to a Pandas DataFrame

In [8]:
# Assigning a list to a column label that does not exist yet appends a new
# column; the list supplies one value per existing row, in row order.
main_characters = ['Monkey D Luffy', 'Naruto', 'Ichigo', 'Gintoki']
df['Main Character'] = main_characters

df

Unnamed: 0,Anime,Episodes,Main Character
0,One Piece,1014,Monkey D Luffy
1,Naruto,720,Naruto
2,Bleach,366,Ichigo
3,Gintama,366,Gintoki


In [9]:
# Assigning a scalar to a new column label broadcasts it: every row of the
# new 'Check' column receives the value 1.
df['Check'] = 1

# Bare last expression so the notebook renders the updated frame.
df

Unnamed: 0,Anime,Episodes,Main Character,Check
0,One Piece,1014,Monkey D Luffy,1
1,Naruto,720,Naruto,1
2,Bleach,366,Ichigo,1
3,Gintama,366,Gintoki,1


### Dot Notation and Square-Bracket Notation for Selecting Columns

In [12]:
# Square-bracket notation: index the DataFrame with a column label.
# A single label returns that column as a Series. This form works for any
# column name, including names containing spaces such as 'Main Character'.
df['Anime']

0    One Piece
1       Naruto
2       Bleach
3      Gintama
Name: Anime, dtype: object

In [13]:
# Dot notation: access the column as an attribute of the DataFrame.
# Returns the same Series as df['Anime'], but it only works when the column
# name is a valid Python identifier (so not for 'Main Character').
df.Anime

0    One Piece
1       Naruto
2       Bleach
3      Gintama
Name: Anime, dtype: object

In [14]:
# Passing a *list* of labels selects several columns at once and returns a
# DataFrame whose columns follow the order given in the list.
df[['Main Character', 'Anime']]

Unnamed: 0,Main Character,Anime
0,Monkey D Luffy,One Piece
1,Naruto,Naruto
2,Ichigo,Bleach
3,Gintoki,Gintama


### Modifying the Index of a DataFrame

In [16]:
# Rebuild the demo frame, this time passing explicit row labels 'a'..'d'
# through `index=` instead of relying on the default 0..3 labels.
# NOTE(review): One Piece is listed with 994 episodes here but 1014 in the
# first DataFrame cell of this notebook — confirm which value is intended.
df = pd.DataFrame(
    {
        "Anime": ["One Piece", "Naruto", "Bleach", "Gintama"],
        "Episodes": [994, 720, 366, 366],
    },
    index=['a', 'b', 'c', 'd'],
)

df

Unnamed: 0,Anime,Episodes
a,One Piece,994
b,Naruto,720
c,Bleach,366
d,Gintama,366


In [17]:
# Use the 'Episodes' column as the row index via `set_index`.
# The call returns a new DataFrame; the result is only displayed here, not
# assigned, so `df` itself keeps its 'a'..'d' index.
# Note: 'Episodes' contains a repeated value (366), so the resulting index
# is not unique.

df.set_index('Episodes')

Unnamed: 0_level_0,Anime
Episodes,Unnamed: 1_level_1
994,One Piece
720,Naruto
366,Bleach
366,Gintama


### Create a Pandas Series
Unlike a DataFrame, a Series is a one-dimensional representation of the data. You can pass either a one-dimensional ndarray or a list to create a Series.

In [18]:
# Create a Series from a list of values with explicit labels 'a', 'b', 'c'.
# NOTE(review): this rebinds `df`, which held a DataFrame in the earlier
# cells, to a Series — a distinct name would avoid the shadowing.
df = pd.Series(data=['Good', 'Bad', 'Neutral'], index=list('abc'))

df

a       Good
b        Bad
c    Neutral
dtype: object

In [19]:
# Series of 10 uniform random floats in [0, 1), labelled 1..10.
# The values come from a seeded Generator so the cell produces the same
# numbers on every Restart & Run All; the unseeded np.random.rand call it
# replaces gave different values on each run.
df_2 = pd.Series(np.random.default_rng(42).random(10), index=list(range(1, 11)))

df_2

1     0.553381
2     0.111331
3     0.599963
4     0.375732
5     0.066581
6     0.691861
7     0.773520
8     0.569885
9     0.229309
10    0.937833
dtype: float64

### Reading CSV File

In [21]:
# Load the per-arc episode counts from a CSV file into a DataFrame.
# The path is relative, so the file is resolved against the kernel's
# current working directory.
data = pd.read_csv("naruto_analysis.csv")

In [23]:
# Preview the first rows of the frame; head() defaults to n=5.
data.head()

Unnamed: 0,Arc names,Total Episodes
0,Prologue — Land of Waves Arc,19
1,Chūnin Exams Arc,48
2,Konoha Crush Arc,13
3,Search for Tsunade Arc,20
4,Land of Tea Escort Mission Arc,6


In [24]:
data.head(3)     # head(n): the argument n is how many leading rows to return

Unnamed: 0,Arc names,Total Episodes
0,Prologue — Land of Waves Arc,19
1,Chūnin Exams Arc,48
2,Konoha Crush Arc,13


In [25]:
# Bare expression: the notebook renders the whole DataFrame.
# For larger frames prefer head() / tail() / sample() to keep output short.
data

Unnamed: 0,Arc names,Total Episodes
0,Prologue — Land of Waves Arc,19
1,Chūnin Exams Arc,48
2,Konoha Crush Arc,13
3,Search for Tsunade Arc,20
4,Land of Tea Escort Mission Arc,6
5,Sasuke Recovery Mission Arc,29
6,Filler Arcs Arc,85
7,Kakashi Gaiden Arc,0
8,Special,17
9,Kazekage Rescue Mission Arc,32


In [26]:
# Preview the last rows of the frame; tail() defaults to n=5.
data.tail()

Unnamed: 0,Arc names,Total Episodes
32,Childhood Arc,4
33,Sasuke Shinden: Book of Sunrise Arc,5
34,Shikamaru Hiden: A Cloud Drifting in Silent Da...,5
35,Konoha Hiden: The Perfect Day for a Wedding Arc,7
36,The Seventh Hokage and the Scarlet Spring Arc,0


In [27]:
# tail(n): return only the last n rows (here the final 3).
data.tail(3)

Unnamed: 0,Arc names,Total Episodes
34,Shikamaru Hiden: A Cloud Drifting in Silent Da...,5
35,Konoha Hiden: The Perfect Day for a Wedding Arc,7
36,The Seventh Hokage and the Scarlet Spring Arc,0


In [31]:
data.head(-2)     # a negative n returns every row except the last |n| (here: all but the final 2)

Unnamed: 0,Arc names,Total Episodes
0,Prologue — Land of Waves Arc,19
1,Chūnin Exams Arc,48
2,Konoha Crush Arc,13
3,Search for Tsunade Arc,20
4,Land of Tea Escort Mission Arc,6
5,Sasuke Recovery Mission Arc,29
6,Filler Arcs Arc,85
7,Kakashi Gaiden Arc,0
8,Special,17
9,Kazekage Rescue Mission Arc,32


In [32]:
# sample() with no `n` returns a single randomly chosen row.
# random_state pins the draw so the same row appears on every re-run
# (Restart & Run All); remove it to get a fresh random row each time.
data.sample(random_state=42)

Unnamed: 0,Arc names,Total Episodes
29,Jiraiya Shinobi Handbook: The Tale of Naruto t...,19


In [33]:
# Draw 4 rows at random (without replacement by default).
# random_state keeps the selection reproducible across re-runs; remove it
# to get a different random selection each time.
data.sample(4, random_state=42)

Unnamed: 0,Arc names,Total Episodes
17,Six-Tails Unleashed Arc,8
30,Kaguya Ōtsutsuki Strikes Arc,23
21,Fourth Shinobi World War: Countdown Arc,22
2,Konoha Crush Arc,13


### Info & Describe
`info()` prints a report on the DataFrame: its index range, its columns, their dtypes and the memory usage. For each column it lists the column name, the non-null count and the dtype.

The `describe()` method returns summary statistics for the numeric columns of the DataFrame: count, mean, standard deviation, min, the 25%/50%/75% percentiles (the 50% value is the median) and max.

In [35]:
# Print a structural summary of the frame: index range, each column's
# non-null count and dtype, and the approximate memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Arc names       37 non-null     object
 1   Total Episodes  37 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 724.0+ bytes


In [42]:
# By default describe() summarizes only the numeric columns (here just
# 'Total Episodes'); pass include='all' or include='object' to summarize
# non-numeric columns as well.
data.describe()

Unnamed: 0,Total Episodes
count,37.0
mean,19.459459
std,16.485001
min,0.0
25%,7.0
50%,18.0
75%,22.0
max,85.0


# Working on a Bigger Dataset

In [37]:
# Load the larger dataset from a CSV file; the relative path is resolved
# against the kernel's current working directory.
# NOTE(review): the columns (show_id, type, title, director, ...) suggest a
# streaming-catalogue titles dataset — confirm and record the data source.
dataset = pd.read_csv('data.csv')

In [38]:
# Peek at the first 2 rows to see the column names and typical values.
dataset.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [39]:
# Structural summary: row count, per-column non-null counts and dtypes.
# Columns whose non-null count is below the total row count contain
# missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [41]:
# Summary statistics for the numeric columns only; in this dataset
# 'release_year' is the sole int64 column, so it is the only one reported.
dataset.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [43]:
# shape is an attribute (no parentheses) holding a (rows, columns) tuple.
dataset.shape

(8807, 12)

In [44]:
# Element-wise missing-value mask: each cell becomes True where the
# original value is missing and False otherwise.
# isna() and isnull() are aliases of the same method.
dataset.isna()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,False,False,False,False,True,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,False,False
3,False,False,False,True,True,True,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,False,False,False,False,False,False,False,False,False,False,False,False
8803,False,False,False,True,True,True,False,False,False,False,False,False
8804,False,False,False,False,False,False,False,False,False,False,False,False
8805,False,False,False,False,False,False,False,False,False,False,False,False


In [45]:
# isna() yields a boolean frame; sum() adds the True values column by
# column, giving the number of missing entries in each column.
dataset.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64