In [12]:
import pandas as pd
import numpy as np

- #### Pandas DataFrames are two-dimensional data structures with labeled rows and columns, that can hold many data types.

In [13]:
# Sample dataframe
df = pd.DataFrame({
    'A': [1,2,3],
    'B': [4,5,6],
    'C': [7,8,9]
})
df

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


## Axes :---

In [14]:
# Understanding Axes
df.sum() # it sums down 0 axis (rows)
df.sum(axis=0) # same as before, sums down 0 axis (cuz axis=0 is default)
df.sum(axis=1) # sums across the 1 axis (columns)

0    12
1    15
2    18
dtype: int64

## Loading Data into DF :---

In [15]:
# Loading data into DF
df = pd.read_csv('username-or-email.csv')

# limit which rows are read when reading in a file 
pd.read_csv('username-or-email.csv', nrows=3) # it will only read first 3 rows
pd.read_csv('username-or-email.csv', skiprows=[1,4]) # it will skip 1st row(i.e. row-index=0) & 4th row(i.e. row-index=3)

# randomly sample a dataframe
duffer_1 = df.sample(frac=0.62, random_state=1) # it will contain 62% of the rows
print(duffer_1)
# this sample() randomly picks rows from df, like sufflinf a deck of cards
# And Without random_state=1 we'll get differetn rows everytime but with it we'll get same rows every time
duffer_2 = df[~df.index.isin(duffer_1.index)] # it will contain the other 38% of the rows
print(duffer_2)

# changing the maximum number of rows and columns printed
pd.set_option(max_columns,None) # 'None' means 'unlimited'
# default is 60 rows

pd.set_option(max_columns,None)
# default is 60 columns

# Reset options to defaults
pd.reset_option(max_rows)
pd.reset_option(max_columns)

  Username;Login email;Identifier;First name;Last name
2  johnson81;craig@yourcompany.com;4081;Craig;Joh...  
1       grey07;laura@yourcompany.com;2070;Laura;Grey  
4         smith79;jamie@example.com;5079;Jamie;Smith  
  Username;Login email;Identifier;First name;Last name
0     booker12;rachel@example.com;9012;Rachel;Booker  
3       jenkins46;mary@example.com;9346;Mary;Jenkins  


NameError: name 'max_columns' is not defined

## Create DataFrame :---

In [None]:
# Creating a dictionary of pandas series
items = {
    "Miku": pd.Series(data=[100, 33, 181], index=["Watches", "shoes", "books"]),
    "Milan": pd.Series(data=[43, 22, 90], index=["toys", "shoes", "bats"]),
    "Nikita": pd.Series(data=[90, 28, 187], index=["makeup-kit", "sarees", "books"]),
}

print(type(items))  # <class 'dict'>

# creating a pandas DataFrame by passing it a dictionary of Series
amazon_cart = pd.DataFrame(items)

# creating dataframe that only has a subset of the data/colums
shoes_shopping_cart = pd.DataFrame(
    items, columns=["Milan"]
)  # will show everything of 'Milan' column only

# creating a dataframe that only has selected keys
shopping_cart = pd.DataFrame(
    items, index=["toys", "makeup-kit"]
)  # this will show only 'toys' & 'makeup-kit' index/row for all columns

# combining both of the above - i.e. selected keys for selected columns
Miku_Nikita_books_shoes_collection = pd.DataFrame(
    items, columns=["Miku", "Nikita"], index=["books", "shoes"]
)

# Creating DataFrames from a dictionary of lists (arrays)
# In this case, however, all the lists (arrays) in the dictionary must be of the same length

# Creating a dictioonary of lists (arrays)
data = {"Integers": [1, 2, 3], "Floats": [1.2, 3.4, 5.6]}

# Now creating a dataframe using the above lists
df = pd.DataFrame(data)

# creating a DataFrame & providing the row index for each row/key
df = pd.DataFrame(data, index=["nums-1", "nums-2", "nums-3"])

# Creating DataFrames from a list of Python dictionaris
# CReating a list of python dictionaries
items2 = [
    {"t-shirts": 5, "full-shirts": 2, "watches": 3},
    {"bikes": 0, "goggles": 6, "watches": 2},
]

# Creating a DataFrame
store_items2 = pd.DataFrame(items2)


# Creating a DataFrame and providing th row index
store_items2 = pd.DataFrame(items2, index=['store-1','store-2'])

<class 'dict'>
   t-shirts  full-shirts  watches  bikes  goggles
0       5.0          2.0        3    NaN      NaN
1       NaN          NaN        2    0.0      6.0


## Create df from Series, dicts :---

In [None]:
# creaing dictionary from a bunch of Series/data
books = pd.Series(
    data=[
        "half-girlfriend",
        "Of Mice and Men",
        "Romeo and Juliet",
        "The Time Machine",
        "Alice in Wonderland",
    ]
)
authors = pd.Series(
    data=[
        "Chetan Bhagat",
        "John Steinbeck",
        "William Shakespeare",
        " H. G. Wells",
        "Lewis Carroll",
    ]
)

user_1 = pd.Series(data=[1.4, 4.5])
user_2 = pd.Series(data=[3.5, 9.1, 1.1, 9.0])
user_3 = pd.Series(data=[1.4, 5, np.nan, 4.2])
user_4 = pd.Series(data=[4, 3.5, 6, 8])

a_dict = {'Author':authors, 'Book Title':books, 'User_1':user_1, 'User_2':user_2, 'User_3':user_3, 'User_4':user_4}

# Use the dictionary to create a Pandas DataFrame
book_ratings = pd.DataFrame(a_dict)

# convert to numpy array (remove the column names, get just the values to convert it into a numpy array)
book_ratings_numpy = book_ratings.values
book_ratings_numpy

In [None]:
# CReating a DataFrame from a dictionary
pd.DataFrame(
    {
        "column_x": ["value-x1", "value-x2", "value-x3"],
        "column_y": ["value-y1", "value-y2", "value-y3"],
    }
)

# creatign a DataFrame from a list of lists
pd.DataFrame([
    ['value_x1','value_y1'],
    ['value_x2','value_y2'],
    ['value_x3','value_y3']
])

## Accessing Elements :---

In [None]:
# Accessing elements :---
# Accesing via col-label
print('watches in each store:- \n',store_items2[['watches']])
print('\nt-shirts & goggles in each store:- \n',store_items2[['t-shirts','goggles']])

# Accessing via row-label
print('\nItems in store-1:-\n',store_items2.loc[['store-1']])

# Accessing via both row & col label
print('\nfull-shirts in store-1:',store_items2['full-shirts']['store-1'])

# while accessing individual elements in a dataframe, the labels should always be provided with column label first, then row label 
# i.e. in the form dataframe[column][row]

## Modify Elements :---

In [39]:
# Modifying Elements
# Adding new column (adds it to the end of the DataFrame)
store_items2["full-shirts"] = [15, 20]

# Adding new column
# Via arithmetic operations between columns
store_items2["leggings"] = store_items2["full-shirts"] + store_items2["t-shirts"]

# Adding new Row
# by creating a new df then applying it to the original df
# creatign a new dictionary froma list of python dicionaries
new_items = [{"slippers": 18, "laptops": 5, "dumbell-plates": 8}]

# Now creating a new df wih new items and providing an index label 'store-3'
new_store = pd.DataFrame(new_items, index=["store-3"])

# Now i'll concat store-3 to our store_items2 DataFrame
store_items2 = pd.concat([store_items2, new_store])
store_items2

Unnamed: 0,t-shirts,full-shirts,watches,bikes,goggles,leggings,slippers,laptops,dumbell-plates
store-1,5.0,15.0,3.0,,,20.0,,,
store-2,,20.0,2.0,0.0,6.0,,,,
store-3,,,,,,,18.0,5.0,8.0
