## Working with DataFrames

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the sample dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("x1.csv")

### Peek at DataFrame Structure

##### Index and datatypes (total number of rows and the datatype of each column)

In [5]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 7 columns):
Unnamed: 0    52 non-null object
c0            52 non-null float64
c1            52 non-null float64
c2            52 non-null float64
c3            52 non-null float64
c4            52 non-null float64
groupable     52 non-null object
dtypes: float64(5), object(2)
memory usage: 2.9+ KB


##### Summary statistics for each numeric column

In [8]:
# Count, mean, std, min, quartiles and max; only the numeric columns appear in the result.
df.describe()

Unnamed: 0,c0,c1,c2,c3,c4
count,52.0,52.0,52.0,52.0,52.0
mean,0.014553,-0.138174,0.03911,0.109155,-0.221412
std,1.132582,0.98906,0.969464,1.094207,1.047459
min,-3.002764,-2.261887,-2.398951,-3.227313,-3.149114
25%,-0.801529,-0.857035,-0.501361,-0.577483,-0.854859
50%,0.114479,-0.252286,0.089164,0.158602,-0.185503
75%,0.828127,0.605523,0.596721,0.833329,0.409378
max,2.195786,1.921311,2.280235,2.268919,1.944759


### Selection

In [9]:
# Positional selection: first four rows and first four columns.
top_left_df = df.iloc[:4, :4]

In [10]:
# Display the top-left 4x4 slice.
top_left_df

Unnamed: 0.1,Unnamed: 0,c0,c1,c2
0,A,-1.255331,-0.445616,-2.398951
1,B,-0.08605,-0.513851,-0.107019
2,C,0.047551,0.786886,0.723076
3,D,0.386498,-0.052349,-0.261739


In [11]:
# Positional selection with negative offsets: last four rows and last four columns.
bottom_right_df = df.iloc[-4:, -4:]

In [12]:
# Display the bottom-right 4x4 slice.
bottom_right_df

Unnamed: 0,c2,c3,c4,groupable
48,-1.66361,0.832867,-3.149114,b
49,0.996165,0.803867,-1.783907,b
50,0.564321,-0.567338,-1.325157,b
51,1.081013,-0.607918,0.100003,b


### DataFrame Utility Methods

##### Copy a dataframe

In [15]:
# Independent (deep) copy: the copy gets its own data and index,
# so later changes to df's data are not reflected in df_c.
df_c = df.copy(deep=True)

##### Compute numerical data ranks (1 through n) along an axis. Equal values are assigned a rank that is the average of the ranks of those values.

In [17]:
# Replace every value with its rank within its column (axis=0);
# tied values share the average of the ranks they span.
# NOTE: this rebinds df to the ranked frame; df_c still holds the loaded values.
df = df.rank(axis=0, method="average")

##### Sort values by column

In [22]:
# First two rows of the ranked frame, before sorting.
df.head(2)

Unnamed: 0.1,Unnamed: 0,c0,c1,c2,c3,c4,groupable
33,34.0,1.0,35.0,35.0,15.0,10.0,38.5
44,45.0,2.0,20.0,52.0,42.0,25.0,38.5


In [23]:
# Reorder rows by ascending value of column c1; the original index labels travel with their rows.
df = df.sort_values(by="c1", ascending=True)

In [24]:
# First two rows after sorting by c1 (the smallest c1 values now lead).
df.head(2)

Unnamed: 0.1,Unnamed: 0,c0,c1,c2,c3,c4,groupable
51,52.0,11.0,1.0,48.0,13.0,31.0,14.5
23,24.0,14.0,2.0,10.0,48.0,48.0,27.5


##### Sort values by index

In [25]:
# Restore row order by ascending index label, undoing the reordering from the column sort.
df = df.sort_index(axis=0, ascending=True)

In [26]:
# First two rows after sorting by index (labels 0 and 1 lead again).
df.head(2)

Unnamed: 0.1,Unnamed: 0,c0,c1,c2,c3,c4,groupable
0,1.0,6.0,24.0,1.0,28.0,22.0,38.5
1,2.0,24.0,22.0,20.0,5.0,49.0,27.5
