In [1]:
# Importing numpy and pandas
import numpy as np
import pandas as pd

## <center>Pandas DataFrame</center>

In [2]:
# Build a pandas DataFrame from a plain Python dictionary: every key becomes a
# column label and every (equal-length) list supplies that column's values.
scores = {
    'names': ['Rojit', 'Mark', 'Elon', 'Bill', 'Larry', 'Hinton'],
    'city': ['Toronto', 'California', 'Los Angeles', 'NY', 'DC', 'Toronto'],
    'score': [79, 98, 49, 88, 48, 100],
    'company': ['Job less', 'Facebook', 'Tesla', 'Microsoft', 'Google', 'Google'],
    'genius': [False, True, True, False, False, True],
    # None marks a missing profession for one row.
    'profession': ['Programming', 'Business', 'Business', 'Programming', None, 'Programming'],
}
df = pd.DataFrame(data=scores)
df

Unnamed: 0,names,city,score,company,genius,profession
0,Rojit,Toronto,79,Job less,False,Programming
1,Mark,California,98,Facebook,True,Business
2,Elon,Los Angeles,49,Tesla,True,Business
3,Bill,NY,88,Microsoft,False,Programming
4,Larry,DC,48,Google,False,
5,Hinton,Toronto,100,Google,True,Programming


In [3]:
# Selecting a single column label with [] yields a one-dimensional pandas
# Series rather than a DataFrame; print both the values and the type.
names_column = df['names']
print(names_column)
print(type(names_column))

0     Rojit
1      Mark
2      Elon
3      Bill
4     Larry
5    Hinton
Name: names, dtype: object
<class 'pandas.core.series.Series'>


In [4]:
# Add a derived column by assigning to a new column label. String columns
# concatenate element-wise with +, so each row becomes "<name>, <city>".
name_and_city = df['names'] + ', ' + df['city']
df['name_city'] = name_and_city
df

Unnamed: 0,names,city,score,company,genius,profession,name_city
0,Rojit,Toronto,79,Job less,False,Programming,"Rojit, Toronto"
1,Mark,California,98,Facebook,True,Business,"Mark, California"
2,Elon,Los Angeles,49,Tesla,True,Business,"Elon, Los Angeles"
3,Bill,NY,88,Microsoft,False,Programming,"Bill, NY"
4,Larry,DC,48,Google,False,,"Larry, DC"
5,Hinton,Toronto,100,Google,True,Programming,"Hinton, Toronto"


## <center> Filtering dataframe </center>

In [5]:
# A comparison on a Series is evaluated element-wise: the result is a boolean
# Series (a mask) that is True where score > 50 -- not the matching rows themselves
df['score'] > 50

0     True
1     True
2    False
3     True
4    False
5     True
Name: score, dtype: bool

In [6]:
condition = df['score'] > 50  # same boolean mask as in the previous cell
# Indexing the DataFrame with a boolean mask keeps only the rows where the mask is True
df[condition]

Unnamed: 0,names,city,score,company,genius,profession,name_city
0,Rojit,Toronto,79,Job less,False,Programming,"Rojit, Toronto"
1,Mark,California,98,Facebook,True,Business,"Mark, California"
3,Bill,NY,88,Microsoft,False,Programming,"Bill, NY"
5,Hinton,Toronto,100,Google,True,Programming,"Hinton, Toronto"


##  <center>Basic dataframe methods and attributes</center>

In [7]:
# head(n) previews the first n rows of the DataFrame; n defaults to 5 when omitted
df.head(n=2)

Unnamed: 0,names,city,score,company,genius,profession,name_city
0,Rojit,Toronto,79,Job less,False,Programming,"Rojit, Toronto"
1,Mark,California,98,Facebook,True,Business,"Mark, California"


In [8]:
# tail(n) previews the last n rows of the DataFrame
df.tail(n=3)

Unnamed: 0,names,city,score,company,genius,profession,name_city
3,Bill,NY,88,Microsoft,False,Programming,"Bill, NY"
4,Larry,DC,48,Google,False,,"Larry, DC"
5,Hinton,Toronto,100,Google,True,Programming,"Hinton, Toronto"


In [9]:
# dtypes is an attribute (no parentheses): a Series mapping each column name to its data type
df.dtypes

names         object
city          object
score          int64
company       object
genius          bool
profession    object
name_city     object
dtype: object

In [10]:
# shape is an attribute holding the dimensions as a tuple: (number of rows, number of columns)
df.shape

(6, 7)

## <center>Subsetting data </center>

In [11]:
# loc selects by index label, and a label slice includes BOTH endpoints,
# so 2:4 returns the rows labelled 2, 3 and 4; ":" keeps every column
df.loc[2:4, :]

Unnamed: 0,names,city,score,company,genius,profession,name_city
2,Elon,Los Angeles,49,Tesla,True,Business,"Elon, Los Angeles"
3,Bill,NY,88,Microsoft,False,Programming,"Bill, NY"
4,Larry,DC,48,Google,False,,"Larry, DC"


In [12]:
# loc[row_label, column_label] retrieves a single value;
# here: the 'names' and 'city' entries of the row labelled 0
(
    df.loc[0, 'names'],
    df.loc[0, 'city'],
)

('Rojit', 'Toronto')

In [13]:
# iloc selects purely by integer position: [row_position, column_position];
# column position 0 is the 'names' column
(
    df.iloc[3, 0],
    df.iloc[0, 0],
)

('Bill', 'Rojit')

## <center>Basic Analysis</center>

#### value_counts()

In [14]:
# Using pandas value_counts()
# Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)
# Returns a pandas Series holding the count of each unique value (index = the unique values),
# sorted by count in descending order by default

In [15]:
# Count how many rows hold each distinct company; the most frequent value is listed first
df['company'].value_counts()

company
Google       2
Job less     1
Facebook     1
Tesla        1
Microsoft    1
Name: count, dtype: int64

In [16]:
# value_counts() leaves missing values out by default (dropna=True);
# passing dropna=False adds an entry that counts them as well
profession = df['profession']
print(profession.value_counts())
print("\nUsing dropna=False also returns count with null values \n")
print(profession.value_counts(dropna=False))

profession
Programming    3
Business       2
Name: count, dtype: int64

Using dropna=False also returns count with null values 

profession
Programming    3
Business       2
None           1
Name: count, dtype: int64


#### sort_values()

In [17]:
# DataFrame.sort_values(by, *, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key=None)
# We can sort both series and dataframe


In [18]:
# Sorting a Series: values are ordered ascending (alphabetically for strings)
# and keep their original index labels. inplace defaults to False, so a new
# Series is returned and df['names'] itself is left unchanged.
df['names'].sort_values(ascending=True)

3      Bill
2      Elon
5    Hinton
4     Larry
1      Mark
0     Rojit
Name: names, dtype: object

In [19]:
# Sorting a DataFrame by several columns: 'company' is the primary key and
# 'names' breaks ties between rows with the same company. ascending=False
# applies to both keys, so the order is descending; a new DataFrame is returned.
df.sort_values(by=['company', 'names'], ascending=False)

Unnamed: 0,names,city,score,company,genius,profession,name_city
2,Elon,Los Angeles,49,Tesla,True,Business,"Elon, Los Angeles"
3,Bill,NY,88,Microsoft,False,Programming,"Bill, NY"
0,Rojit,Toronto,79,Job less,False,Programming,"Rojit, Toronto"
4,Larry,DC,48,Google,False,,"Larry, DC"
5,Hinton,Toronto,100,Google,True,Programming,"Hinton, Toronto"
1,Mark,California,98,Facebook,True,Business,"Mark, California"
