# Introducing the pandas DataFrame

In [1]:
import pandas as pd

In [2]:
# Column-oriented sample data: every key becomes a DataFrame column and
# each list holds that column's seven values in row order.
data = {
    'name': [
        'Xavier', 'Ann', 'Jana', 'Yi', 'Robin', 'Amal', 'Nori',
    ],
    'city': [
        'Mexico City', 'Toronto', 'Prague', 'Shanghai',
        'Manchester', 'Cairo', 'Osaka',
    ],
    'age': [41, 28, 33, 34, 38, 31, 37],
    'py-score': [88.0, 79.0, 81.0, 80.0, 68.0, 61.0, 84.0],
}

# Row labels 101 through 107 (the stop value is exclusive).
index = range(101, 108)

In [9]:
df = pd.DataFrame(data,index=index)

In [10]:
type(df)

pandas.core.frame.DataFrame

In [11]:
df.index

RangeIndex(start=101, stop=108, step=1)

In [12]:
df.columns

Index(['name', 'city', 'age', 'py-score'], dtype='object')

In [13]:
type(df.columns)

pandas.core.indexes.base.Index

In [14]:
df.head()

Unnamed: 0,name,city,age,py-score
101,Xavier,Mexico City,41,88.0
102,Ann,Toronto,28,79.0
103,Jana,Prague,33,81.0
104,Yi,Shanghai,34,80.0
105,Robin,Manchester,38,68.0


In [15]:
df.tail()

Unnamed: 0,name,city,age,py-score
103,Jana,Prague,33,81.0
104,Yi,Shanghai,34,80.0
105,Robin,Manchester,38,68.0
106,Amal,Cairo,31,61.0
107,Nori,Osaka,37,84.0


In [16]:
df.head(3)

Unnamed: 0,name,city,age,py-score
101,Xavier,Mexico City,41,88.0
102,Ann,Toronto,28,79.0
103,Jana,Prague,33,81.0


In [None]:
df

In [None]:
df['city']

In [None]:
type(df['city'])

In [None]:
cities = df['city']

In [None]:
cities

In [None]:
cities.index

In [None]:
df.age

In [None]:
df['py-score']

In [None]:
df.index

In [None]:
df.loc[103]

In [None]:
type(df.loc[103])

In [None]:
cities

In [None]:
cities[103]

## Creating a pandas DataFrame

In [None]:
import numpy as np

In [None]:
d = {'x': [1,2,3], 'y': np.array([2,4,8]), 'z': 100}

In [None]:
pd.DataFrame(d, index=[100, 200, 300], columns=['z', 'y', 'x'])

In [None]:
# Row-oriented input: one dict per row, keyed by column label.
lst = [{'x': x, 'y': y, 'z': 100} for x, y in [(1, 2), (2, 4), (3, 8)]]

# The dict keys become the columns; the rows are labelled 'a', 'b' and 'c'.
pd.DataFrame(lst, index=['a','b','c'])

In [None]:
# Row-oriented input as a nested list: each inner list is one row.
lst2 = [[x, y, 100] for x, y in zip((1, 2, 3), (2, 4, 8))]

# A nested list carries no labels, so the column names are passed explicitly.
pd.DataFrame(lst2, columns=['x', 'y', 'z'])

In [None]:
# Source matrix: three rows of (x, y, z) values.
arr = np.array([
    [1, 2, 100],
    [2, 4, 100],
    [3, 8, 100],
])

# copy=True gives the frame its own copy of the data, so a later write to
# `arr` is not reflected in `df_`.
df_ = pd.DataFrame(data=arr, columns=['x', 'y', 'z'], copy=True)


In [None]:
df_

In [None]:
arr[1,1] = 33

In [None]:
df_

In [None]:
df

In [None]:
df.to_csv("job_candidates.csv")

In [None]:
pd.read_csv("job_candidates.csv", index_col=0)

In [None]:
df.index

In [None]:
df.index[0]

In [None]:
df.index[1]

In [None]:
df.columns

In [None]:
df.columns[2]

In [None]:
# Demonstration: an Index is immutable, so assigning to a single position
# raises TypeError. The error is caught and printed so the point is still
# made without stopping a Restart & Run All. Replacing the whole index
# (`df.index = ...`) is the supported way to change labels.
try:
    df.index[0] = 100
except TypeError as err:
    print(f"TypeError: {err}")

In [None]:
df.index

In [None]:
df.index = np.arange(10, 17)

In [None]:
df.index

In [None]:
df.values

In [None]:
df.to_numpy()

In [None]:
df.dtypes

In [None]:
df_ = df.astype(dtype={'age': np.int32, 'py-score': np.float32})

In [None]:
df_.dtypes

In [None]:
df.ndim

In [None]:
df.size

In [None]:
df.shape

In [None]:
df_.memory_usage()

## Accessing and Modifying Data

In [None]:
df['name']

In [None]:
df.name

In [None]:
df.age

In [None]:
# Demonstration: .loc looks rows up by label, not by position. The index was
# replaced with the labels 10..16 earlier, so label 0 does not exist and the
# lookup raises KeyError. Catch and print it so the notebook keeps running.
try:
    df.loc[0]
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
df.index

In [None]:
df.loc[11]

In [None]:
df.loc[:, ['age','py-score']]

In [None]:
# Keep the rows whose index label is even, showing only name and city.
df.loc[df.index % 2 == 0, ['name','city']]

In [None]:
df.iloc[:,[0,2]]

In [None]:
df.iloc[[0, 2], :]

In [None]:
df.loc[12, 'city']

In [None]:
df.at[12, 'city']

In [None]:
df.iat[2, 1]

In [None]:
df

In [None]:
df.loc[:13, 'py-score'] = [40, 50, 60, 70]
df.loc[14:, 'py-score'] = 0

In [None]:
df

In [None]:
df.iloc[:, -1] = np.linspace(20, 50, len(df))

In [None]:
df

In [None]:
old_row = df.loc[16]

In [None]:
df.loc[16] = ['Jack', 'Chicago', 29, 70]
df

In [None]:
df.loc[16] = old_row

In [None]:
df

In [None]:
df.loc[11, 'city'] = 'Ottawa'

In [None]:
df

## Inserting and Deleting Data

In [None]:
df

In [None]:
john = pd.Series(data=['John', 'Boston', 34, 79], index=df.columns, name=17)

In [None]:
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the replacement.
# `john` is a Series named 17, so to_frame().T turns it into a one-row frame
# labelled 17 with the same columns as `df`. infer_objects() restores numeric
# dtypes on that row so the concatenated columns do not degrade to object.
df = pd.concat([df, john.to_frame().T.infer_objects()])

In [None]:
df

In [None]:
# Remove the row labelled 17 again. Rebinding the result (rather than
# inplace=True) matches how the 'age' column is dropped later in this notebook.
df = df.drop(labels=[17])

In [None]:
df

In [None]:
df['js-score'] = [71.0, 95.0, 88.0, 79.0, 91.0, 91.0, 80.0]
df

In [None]:
df['total-score'] = 0.0
df

In [None]:
df.insert(loc=4, column='django-score', value=[71.0, 95.0, 88.0, 79.0, 91.0, 91.0, 80.0])
df

In [None]:
del df['total-score']
df

In [None]:
df['total-score'] = 0.0
df

In [None]:
total_score = df.pop('total-score')
total_score

In [None]:
df

In [None]:
df = df.drop(labels=['age'], axis=1)
df

## Applying Arithmetic Operations

In [None]:
df

In [None]:
df['total'] = 0.3 * df['js-score'] + 0.4 * df['py-score'] + 0.3 * df['django-score']

In [None]:
df

In [None]:
# Per-column weights for the total score; the Series index holds the names
# of the score columns each weight applies to.
wgts = pd.Series({'py-score': 0.4, 'django-score': 0.3, 'js-score': 0.3})
wgts

In [None]:
import numpy as np

In [None]:
# Multiply each score column by its weight (aligned on column label), then
# add the weighted scores across each row.
weighted_scores = df[wgts.index] * wgts
df['total'] = weighted_scores.sum(axis=1)

In [None]:
df

## Sorting a pandas DataFrame

In [None]:
df

In [None]:
df.sort_values(by=['js-score','py-score'], ascending=[False, False], inplace=True)

In [None]:
df

## Filtering Data

In [None]:
df

In [None]:
filter_ = (df['py-score'] >= 40.0) | (df['js-score'] >= 90)

In [None]:
df[filter_]

In [None]:
df['js-score'].where(cond=df['js-score'] >= 80, other=0.0)

In [None]:
df.filter(regex="score")

## Determining Data Statistics

In [None]:
df

In [None]:
df.describe()

In [None]:
df['py-score'].mean()

In [None]:
df['py-score'].std()

In [None]:
# `df` still holds the text columns 'name' and 'city'. Since pandas 2.0 the
# default numeric_only=False makes mean() raise TypeError on them, so restrict
# the reduction to the numeric columns explicitly.
df.mean(numeric_only=True)

## Handling Missing Data

In [None]:
df_ = pd.DataFrame({'x': [1, 2, np.nan, 4]})
df_

In [None]:
df_.mean()

In [None]:
7/3

In [None]:
df_.mean(skipna=False)

In [None]:
1 + 2 + np.nan

In [None]:
df_.fillna(value=0)

In [None]:
# Backward fill: each missing value takes the next valid value below it.
# fillna(method=...) is deprecated; bfill() is the dedicated method.
df_.bfill()

In [None]:
df_

In [None]:
df_.interpolate()

In [None]:
df_.dropna(axis=0, inplace=True)

In [None]:
df_

## Iterating Over a pandas DataFrame

In [None]:
df

In [None]:
# Iterate column by column: items() yields (column label, column Series)
# pairs. The older iteritems() alias was removed in pandas 2.0.
for col_label, col in df.items():
    print(col_label, col, sep='\n', end='\n\n')

In [None]:
# Iterate row by row as namedtuples called "JobCandidate"; index=False leaves
# the row label out of each tuple.
for row in df.itertuples(index=False, name="JobCandidate"):
    # Printing the tuple shows every field at once; individual values are also
    # available as attributes, e.g. row.name, row.city and row.total.
    print(row)

## Working With Time Series

In [None]:
temp_c = [8.0,  7.1,  6.8,  6.4,  6.0,  5.4,  4.8,  5.0,
           9.1, 12.8, 15.3, 19.1, 21.2, 22.1, 22.4, 23.1,
          21.0, 17.9, 15.5, 14.4, 11.9, 11.0, 10.2,  9.1]

In [None]:
# 24 hourly timestamps covering 2019-10-27. The lowercase 'h' alias is used
# because uppercase 'H' is deprecated in recent pandas, and it matches the
# '6h' rule passed to resample() later in this notebook.
dt = pd.date_range(start='2019-10-27 00:00:00', periods=24, freq='h')

In [None]:
type(dt)

In [None]:
temp = pd.DataFrame(data={'temp_c': temp_c}, index=dt)

In [None]:
temp

In [None]:
type(temp.index)

In [None]:
# Label-based slice on the DatetimeIndex using partial date strings.
temp.loc['2019-10-27 05':'2019-10-27 15']

In [None]:
temp.resample(rule='6h').min()

In [None]:
temp

In [None]:
temp.rolling(window=3, center=True).mean()

## Plotting With pandas DataFrames

In [1]:
# Draw the temperatures as a green '.-' line with size-10 markers filled in
# black, then write the resulting figure to temperatures.png.
ax = temp.plot.line(color='g', style='.-', mfc='k', ms=10)
ax.get_figure().savefig('temperatures.png')

NameError: name 'temp' is not defined

In [None]:
df

In [None]:
df['py-score'].plot.hist(bins=3, alpha=0.8)

In [None]:
df[['js-score','total']].plot.hist(bins=3, alpha=0.5)