# Pandas Tutorial: DataFrames in Python
## See: https://www.datacamp.com/community/tutorials/pandas-tutorial-dataframe-python

In [4]:
import numpy as np

# A structured array
my_array = np.ones(3, dtype=([('foo', int), ('bar', float)]))
print(my_array)

print()

# A record array
my_array2 = my_array.view(np.recarray)
print(my_array2.foo)

[(1, 1.) (1, 1.) (1, 1.)]

[1 1 1]


In [5]:
import pandas as pd

## 1. How To Create a Pandas DataFrame

In [7]:
data = np.array([['','Col-1','Col-2'],
                ['Row-1',1,2],
                ['Row-2',3,4]])
                
print(pd.DataFrame(data=data[1:,1:],
                  index=data[1:,0],
                  columns=data[0,1:]))

      Col-1 Col-2
Row-1     1     2
Row-2     3     4


In [10]:
# Take a 2D array as input to your DataFrame 
my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print(pd.DataFrame(my_2darray))

print()

# Take a dictionary as input to your DataFrame 
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
print(pd.DataFrame(my_dict))

print()

# Take a DataFrame as input to your DataFrame 
my_df = pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A'])
print(pd.DataFrame(my_df))

print()

# Take a Series as input to your DataFrame
my_series = pd.Series({"United Kingdom":"London", "India":"New Delhi", "United States":"Washington", "Belgium":"Brussels"})
print(pd.DataFrame(my_series))

   0  1  2
0  1  2  3
1  4  5  6

   1  2  3
0  1  1  2
1  3  2  4

   A
0  4
1  5
2  6
3  7

                         0
United Kingdom      London
India            New Delhi
United States   Washington
Belgium           Brussels


In [11]:
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))

# Use the `shape` property
print(df.shape)

# Or use the `len()` function with the `index` property
print(len(df.index))
print(df.index)

(2, 3)
2
RangeIndex(start=0, stop=2, step=1)


## 2. How To Select an Index or Column From a Pandas DataFrame

In [20]:
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=['A','B', 'C'])

# Using `iloc[]`
print(df.iloc[0][0])

# Using `loc[]`
print(df.loc[0]['A'])

# Using `at[]`
print(df.at[0,'A'])

# Using `iat[]`
print(df.iat[0, 0])

1
1
1
1


In [21]:
# Use `iloc[]` to select a row
print(df.iloc[0])

# Use `loc[]` to select a column
print(df.loc[:,'A'])

A    1
B    2
C    3
Name: 0, dtype: int32
0    1
1    4
Name: A, dtype: int32


## 3. How To Add an Index, Row or Column to a Pandas DataFrame

In [22]:
# Print out your DataFrame `df` to check it out
print(df)

# Set 'C' as the index of your DataFrame
df.set_index('C')

   A  B  C
0  1  2  3
1  4  5  6


Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1,2
6,4,5


In [24]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                  index= [2, 'A', 4],
                  columns=[48, 49, 50])

# Pass `2` to `loc`
print(df.loc[2])

# Pass `2` to `iloc`
print(df.iloc[2])

# Pass `2` to `ix`
print(df.ix[2])

48    1
49    2
50    3
Name: 2, dtype: int32
48    7
49    8
50    9
Name: 4, dtype: int32
48    7
49    8
50    9
Name: 4, dtype: int32


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


In [25]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), index= [2.5, 12.6, 4.8], columns=[48, 49, 50])

# There's no index labeled `2`, so you will change the index at position `2`
df.ix[2] = [60, 50, 40]
print(df)

# This will make an index labeled `2` and add the new values
df.loc[2] = [11, 12, 13]
print(df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40
      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40
2.0   11  12  13


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [26]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['A', 'B', 'C'])

# Use `.index`
df['D'] = df.index

# Print `df`
print(df)

   A  B  C  D
0  1  2  3  0
1  4  5  6  1
2  7  8  9  2


In [30]:
# Study the DataFrame `df`
print(df)

# Append a column to `df`
df.loc[:, 4] = pd.Series(['5', '6', '7'], index=df.index)

# Print out `df` again to see the changes
print(df)

   A  B  C  D
0  1  2  3  0
1  4  5  6  1
2  7  8  9  2
   A  B  C  D  4
0  1  2  3  0  5
1  4  5  6  1  6
2  7  8  9  2  7


In [31]:
# Check out the weird index of your dataframe
print(df)

# Use `reset_index()` to reset the values
df_reset = df.reset_index(level=0, drop=True)

# Print `df_reset`
print(df_reset)

   A  B  C  D  4
0  1  2  3  0  5
1  4  5  6  1  6
2  7  8  9  2  7
   A  B  C  D  4
0  1  2  3  0  5
1  4  5  6  1  6
2  7  8  9  2  7


## 4. How to Delete Indices, Rows or Columns From a Pandas Data Frame

In [32]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [40, 50, 60], [23, 35, 37]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=[48, 49, 50])
                  
df.reset_index().drop_duplicates(subset='index', keep='last').set_index('index')

Unnamed: 0_level_0,48,49,50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12.6,4,5,6
4.8,40,50,60
2.5,23,35,37


In [38]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [40, 50, 60], [23, 35, 37]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=['A', 'B', 'C'])

# Check out the DataFrame `df`
print(df)

# Drop the column with label 'A'                  
df.drop('A', axis=1, inplace=True)

# Drop the column at position 1
df.drop(df.columns[[1]], axis=1)

       A   B   C
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9
4.8   40  50  60
2.5   23  35  37


Unnamed: 0,B
2.5,2
12.6,5
4.8,8
4.8,50
2.5,35


In [43]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [1, 5, 6], [7, 8, 9], [40, 50, 60], [1, 35, 37]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=[48, 49, 50])

# Check out your DataFrame `df`
print(df)

# Drop the duplicates in `df`
df.drop_duplicates([48], keep='last')

      48  49  50
2.5    1   2   3
12.6   1   5   6
4.8    7   8   9
4.8   40  50  60
2.5    1  35  37


Unnamed: 0,48,49,50
4.8,7,8,9
4.8,40,50,60
2.5,1,35,37


In [46]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [40, 50, 60], [23, 35, 37]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=['A', 'B', 'C'])

# Check out the DataFrame `df`
print(df)

# Drop the index at position 1
print(df.drop(df.index[1]))

       A   B   C
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9
4.8   40  50  60
2.5   23  35  37
      A   B   C
2.5   1   2   3
4.8   7   8   9
4.8  40  50  60
2.5  23  35  37


## 5. How to Rename the Index or Columns of a Pandas DataFrame

In [61]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [40, 50, 60], [23, 35, 37]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=['A', 'B', 'C'])

# Check out your DataFrame `df`
print(df)

# Define the new names of your columns
newcols = {
    'A': 'new_column_1', 
    'B': 'new_column_2', 
    'C': 'new_column_3'
}

# Use `rename()` to rename your columns
df.rename(columns=newcols, inplace=True)

# Rename your index
df.rename(index={1: 'a'})

       A   B   C
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9
4.8   40  50  60
2.5   23  35  37


Unnamed: 0,new_column_1,new_column_2,new_column_3
2.5,1,2,3
12.6,4,5,6
4.8,7,8,9
4.8,40,50,60
2.5,23,35,37


## 6. How To Format The Data in Your Pandas DataFrame

In [62]:
df = pd.DataFrame(data=np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]), 
                  index= [2.5, 12.6, 4.8, 4.8, 2.5], 
                  columns=['Awful', 'Poor', 'OK', 'Acceptable', 'Perfect'])

# Study the DataFrame `df` first
print(df)

# Replace the strings by numerical values (0-4)
df.replace(['Awful', 'Poor', 'OK', 'Acceptable', 'Perfect'], [0, 1, 2, 3, 4]) 

ValueError: Shape of passed values is (3, 5), indices imply (5, 5)