### Getting familiarized with Pandas

In [1]:
import pandas as pd

In [2]:
# Sample fruit counts: one list of four values per fruit (one value per row).
data = dict(
    apples=[300, 200, 0, 100],
    oranges=[0, 300, 700, 200],
)

In [3]:
# Build a DataFrame from the dict: each key becomes a column and, since no
# index is given, the rows are labelled 0..3.
df = pd.DataFrame(data)
df

Unnamed: 0,apples,oranges
0,300,0
1,200,300
2,0,700
3,100,200


In [4]:
# Summarise the frame: index type, per-column non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   apples   4 non-null      int64
 1   oranges  4 non-null      int64
dtypes: int64(2)
memory usage: 196.0 bytes


In [5]:
# dtype of every column, returned as a Series indexed by column name.
df.dtypes

apples     int64
oranges    int64
dtype: object

In [6]:
# Selecting a single column with [] returns a Series.
df["apples"]

0    300
1    200
2      0
3    100
Name: apples, dtype: int64

In [8]:
# Rebuild the frame with explicit row labels instead of the default 0..3.
df = pd.DataFrame(data, index = ["Q1", "Q2", "Q3", "Q4"])
df

Unnamed: 0,apples,oranges
Q1,300,0
Q2,200,300
Q3,0,700
Q4,100,200


In [9]:
# The index is now a label-based Index (Q1..Q4) rather than a RangeIndex.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Q1 to Q4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   apples   4 non-null      int64
 1   oranges  4 non-null      int64
dtypes: int64(2)
memory usage: 96.0+ bytes


In [10]:
# reset_index() returns a new frame: the old labels move into a column named
# "index" and the rows are renumbered from 0. df itself is left untouched.
df1 = df.reset_index()
df1

Unnamed: 0,index,apples,oranges
0,Q1,300,0
1,Q2,200,300
2,Q3,0,700
3,Q4,100,200


In [11]:
# Promote the "index" column back to being the row labels; the result is kept
# by rebinding df1 rather than mutating the frame in place.
df1 = df1.set_index("index")
df1

Unnamed: 0_level_0,apples,oranges
index,Unnamed: 1_level_1,Unnamed: 2_level_1
Q1,300,0
Q2,200,300
Q3,0,700
Q4,100,200


In [12]:
# Assigning to .index replaces all row labels at once; the list must supply
# exactly one label per row.
df1.index = ["q1", "q2", "q3", "q4"]
df1

Unnamed: 0,apples,oranges
q1,300,0
q2,200,300
q3,0,700
q4,100,200


- `.loc[]` -> Accesses a group of rows and columns by label(s) or a boolean array.
- `.iloc[]` -> Accesses a group of rows and columns by integer position(s) or a boolean array.
- Both accept the same kinds of arguments; `.loc[]` interprets them as index labels, while `.iloc[]` interprets them as 0-based integer positions:
    1. A single label, e.g. 5 or 'a' (note that `.loc[]` interprets 5 as a label of the index, not as an integer position along the index; for `.iloc[]` it is a position).
    2. A list or array of labels, e.g. ['a', 'b', 'c'] (for `.iloc[]`, a list of integer positions, e.g. [0, 2]).
    3. A slice object with labels, e.g. 'a':'f'. With `.loc[]` both the start and the stop are included; an `.iloc[]` slice such as 0:3 excludes the stop.
    4. A boolean array of the same length as the axis being sliced, e.g. [True, False, True].
    5. A callable function with one argument (the calling Series or DataFrame) that returns valid output for indexing (one of the above).

In [13]:
df1.loc["q1":"q3"] # Label-based slice: both end points ("q1" and "q3") are included

Unnamed: 0,apples,oranges
q1,300,0
q2,200,300
q3,0,700


In [14]:
# Position-based slice: rows 0, 1 and 2 (the stop position 3 is excluded).
df1.iloc[0:3]

Unnamed: 0,apples,oranges
q1,300,0
q2,200,300
q3,0,700


In [15]:
# Load the letter-frequency table from a CSV file in the notebook's directory.
alphabets = pd.read_csv("./english-letter.csv")
alphabets.head() # head() shows the first 5 rows of the data

Unnamed: 0,letter,frequency
0,A,0.08167
1,B,0.01492
2,C,0.02782
3,D,0.04253
4,E,0.12702


In [17]:
alphabets.columns # returns the column labels as an Index

Index(['letter', 'frequency'], dtype='object')

In [16]:
alphabets.shape # (rows, columns) tuple; len(alphabets.index) and len(alphabets.columns) give the two numbers individually

(26, 2)

In [18]:
# Check row count, missing values and dtypes of the loaded table.
alphabets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   letter     26 non-null     object 
 1   frequency  26 non-null     float64
dtypes: float64(1), object(1)
memory usage: 548.0+ bytes


In [19]:
# Per-column dtypes only (a lighter alternative to info()).
alphabets.dtypes

letter        object
frequency    float64
dtype: object

In [20]:
# head(5) on a single column returns its first five values as a Series.
alphabets["frequency"].head(5)

0    0.08167
1    0.01492
2    0.02782
3    0.04253
4    0.12702
Name: frequency, dtype: float64

In [21]:
# Re-read the file, this time using the "letter" column as the row index so
# that rows can be looked up by letter with .loc.
alphabets = pd.read_csv("english-letter.csv", index_col="letter")
alphabets

Unnamed: 0_level_0,frequency
letter,Unnamed: 1_level_1
A,0.08167
B,0.01492
C,0.02782
D,0.04253
E,0.12702
F,0.02288
G,0.02015
H,0.06094
I,0.06966
J,0.00153


In [22]:
# Passing a list of labels to .loc selects those rows, in the order listed.
vowels = ["A", "E", "I", "O", "U"]
alphabets.loc[vowels]

Unnamed: 0_level_0,frequency
letter,Unnamed: 1_level_1
A,0.08167
E,0.12702
I,0.06966
O,0.07507
U,0.02758


In [None]:
alphabets.sort_values("frequency", ascending=False) # most frequent letters first; returns a new frame, alphabets itself is not reordered

### Data Cleaning

In [1]:
import pandas as pd

# Deliberately messy sample: each column mixes real numbers with free text,
# which is what the cleaning steps below have to deal with.
data = dict(
    col1=[1, 2, 3, '4 is the data'],
    col2=[1.1, 2.2, 'data: -3.2', 4.4],
)
df = pd.DataFrame(data)
df

Unnamed: 0,col1,col2
0,1,1.1
1,2,2.2
2,3,data: -3.2
3,4 is the data,4.4


In [2]:
# Both columns come out as dtype object because each one mixes numbers with text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    4 non-null      object
 1   col2    4 non-null      object
dtypes: object(2)
memory usage: 196.0+ bytes


In [3]:
# The column dtype is object, but each element still has its own Python type.
x = type(df["col1"][0]) # type of the first element in col1
print(x)
type(df["col2"][0]) # type of the first element in col2

<class 'int'>


float

In [4]:
# astype("string") returns a converted frame (df is not reassigned here);
# info() is called on that result to show the new dtypes.
df.astype("string").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    4 non-null      string
 1   col2    4 non-null      string
dtypes: string(2)
memory usage: 196.0 bytes


- use the astype() method to convert the 'col1' column to integers
- use the to_numeric() method to convert the 'col2' column to floats
- with to_numeric(), the errors='coerce' option converts any non-numeric values to NaN
- with to_numeric(), the errors='ignore' option returns the input unchanged if any value is non-numeric (this option is deprecated since pandas 2.2)
- with to_numeric(), the errors='raise' option (the default) raises an exception if any non-numeric values are found


print(df['col1'].astype(int, errors='ignore')) #  astype errors options: {'raise', 'ignore'}, default 'raise'; 'ignore' returns the original column when the conversion fails

print(pd.to_numeric(df['col2'], errors='coerce')) # to_numeric errors options: {'ignore', 'raise', 'coerce'}, default 'raise'; 'coerce' turns unparseable values into NaN

try:
    df['col1'] = df['col1'].astype(float, errors='raise')
except (ValueError, TypeError):  # catch only conversion failures; a bare except would also hide typos and KeyboardInterrupt
    print('Exception raised by astype')

In [6]:
# astype(int) is left commented out: with the default errors='raise' it would
# fail on the value '4 is the data' in col1.
# df['col1'] = df['col1'].astype(int)

print(pd.to_numeric(df['col2'], errors='coerce')) # 'coerce' replaces unparseable values (here 'data: -3.2') with NaN; errors options: {'ignore', 'raise', 'coerce'}, default 'raise'


0    1.1
1    2.2
2    NaN
3    4.4
Name: col2, dtype: float64


In [7]:
# Convert every column to pandas' string dtype and keep the result: the regex
# helpers used in the following cells operate on text, not on ints/floats.
df = df.astype("string")
df

Unnamed: 0,col1,col2
0,1,1.1
1,2,2.2
2,3,data: -3.2
3,4 is the data,4.4


The built-in filter function selects the elements of a sequence based on a function.
Syntax: filter(function, iterable)
filter(function, iterable) yields those items of the iterable for which function(item) is true. In Python 3 the result is a lazy iterator regardless of the input type, so wrap it in list() (or tuple(), etc.) to materialise it, e.g. list(filter(str.isdigit, "a1b2")) gives ['1', '2'].

In [8]:
# `re` is part of the Python standard library, so nothing has to be installed.
import re

# Series.apply calls the function once per element of the column (not once
# per DataFrame row). The lambda receives one string; re.findall returns a
# list with every number-like match in it and [0] keeps the first one, so a
# value containing no number at all would raise IndexError.
# The pattern is a raw string so that \d and \. reach the regex engine
# untouched instead of being treated as (invalid) string escapes.
df["col1"].apply(lambda s: re.findall(r"[+-]?\d*\.?\d+", s)[0])

0    1
1    2
2    3
3    4
Name: col1, dtype: object

In [9]:
# Work on a copy so the uncleaned frame in df stays available.
df1 = df.copy()
print(df1)
df1 = df1.astype("string")  # the .str accessor only works on text values
# Raw string keeps \d and \. intact for the regex engine. The single capture
# group is what str.extract returns: the first match per row, or a missing
# value when a row contains no number.
number_pattern = r"([-+]?\d*\.?\d+)"
# expand=False yields a Series (rather than a one-column DataFrame), which is
# the natural thing to assign back to a single column.
df1["col1"] = df1["col1"].str.extract(number_pattern, expand=False)
df1["col2"] = df1["col2"].str.extract(number_pattern, expand=False)
df1

            col1        col2
0              1         1.1
1              2         2.2
2              3  data: -3.2
3  4 is the data         4.4


Unnamed: 0,col1,col2
0,1,1.1
1,2,2.2
2,3,-3.2
3,4,4.4


import re  
s = 'data: -30.2 N'  
re.findall(r"\d", s)  # single digits: ['3', '0', '2']  
re.findall(r"\d*", s)  # zero or more digits: '30' and '2', plus an empty match at every non-digit position  
re.findall(r"\d+", s)  # runs of digits: ['30', '2']  
re.findall(r"\d+\.?", s)  # digits with an optional trailing dot: ['30.', '2']  
re.findall(r"\d+\.?\d+", s)  # unsigned number: ['30.2']  
re.findall(r"[-+]\d+\.?\d+", s)  # number with a mandatory sign: ['-30.2']  
re.findall(r"[-+]\d*\.?\d+", s)  # integer part made optional: ['-30.2']  
re.findall(r"[-+]\d*\.?\d+", s)[0]  # first match only: '-30.2'  

In [10]:
import re  # regular-expression support from the standard library


def extractAfloat(s):
    """Return the first signed integer/decimal literal found in ``s``, as text.

    Raises IndexError when ``s`` contains no number at all.
    """
    # Raw string: \d and \. must reach the regex engine unchanged.
    return re.findall(r"([-+]?\d*\.?\d+)", s)[0]


# Work on a copy so the uncleaned frame in df stays available.
df1 = df.copy()
print(df1)
df1 = df1.astype("string")  # re.findall needs text input
df1["col1"] = df1["col1"].apply(extractAfloat)
df1["col2"] = df1["col2"].apply(extractAfloat)
df1

            col1        col2
0              1         1.1
1              2         2.2
2              3  data: -3.2
3  4 is the data         4.4


Unnamed: 0,col1,col2
0,1,1.1
1,2,2.2
2,3,-3.2
3,4,4.4


In [11]:
# Load data.csv; this rebinds df, replacing the small example frame used above.
df = pd.read_csv("data.csv")

In [12]:
# Peek at the first five rows.
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [13]:
# Compare each column's non-null count with the number of rows to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# sort_values returns a new frame ordered by Duration (ascending by default);
# each row keeps its original index label, so the index is no longer in order.
df1 = df.sort_values("Duration")
df1

In [None]:
# Returns a frame with a fresh 0..n-1 index and the old labels kept in a new
# "index" column. The result is only displayed, so df1 is unchanged.
df1.reset_index()

In [None]:
# drop=True discards the old labels instead of keeping them as a column.
# Again only displayed, so df1 is unchanged.
df1.reset_index(drop=True)

In [None]:
# df1 still has the out-of-order labels from the sort: neither reset_index
# call above was assigned back.
df1

In [None]:
# Renumber the rows 0..n-1, discarding the old labels (drop=True), and keep
# the result this time by rebinding df1.
df1 = df1.reset_index(drop=True)
df1

In [None]:
# A list of column names inside [] selects several columns and returns a DataFrame.
df1[["Duration", "Pulse"]]

In [None]:
# Boolean mask: keep only the rows whose Duration equals 20.
df1[df1.Duration==20]

In [None]:
# The column selection and the filter above returned new objects; df1 is unchanged.
df1

In [None]:
# Without drop=True the current index is saved as a column (named "index", or
# "level_0" when an "index" column already exists), so running this cell
# repeatedly keeps adding helper columns.
df1 = df1.reset_index()
df1

In [None]:
# Remove the helper columns created by reset_index(). "level_0" only exists
# when the reset_index() cell was executed more than once, so names that are
# absent are skipped (errors="ignore"); otherwise this cell raises KeyError on
# a fresh Restart & Run All.
df1 = df1.drop(columns=["level_0", "index"], errors="ignore")

In [None]:
# Confirm that the helper columns are gone.
df1.head()

In [None]:
# Relabel the Duration column as "Seconds"; only the column name changes,
# the values stay as they are.
df1 = df1.rename(columns={"Duration": "Seconds"})

In [None]:
# The former Duration column should now appear under the name Seconds.
df1.head()

In [None]:
# Element-wise comparison: yields a boolean Series, True where Pulse exceeds 100.
df1["Pulse"]>100

In [None]:
# Store the boolean mask (Pulse above 100) as a new column.
df1["AtRisk"] = df1["Pulse"]>100
df1.head()

In [None]:
# Use the mask as a row filter: only rows with Pulse above 100 are returned.
df1[df1["Pulse"]>100]

In [None]:
# Distinct values of the Seconds column, in order of first appearance.
df1["Seconds"].unique()

In [None]:
# Nested-dict form of replace: {column: {old_value: new_value}}, so only the
# Seconds column is touched. The result is displayed but not assigned, which
# means df1 keeps its original values.
df1.replace({
    "Seconds":{
        15: "Short",
        300: "Long"
    }
})

In [None]:
# df1 is unchanged here: replace() above returned a new frame that was not kept.
df1.head()

In [None]:
# Re-check dtypes and non-null counts after the renamed and added columns.
df1.info()

In [None]:
# Boolean frame of the same shape as df1: True wherever a value is missing.
df1.isna()

In [None]:
df1.isna().any(axis=1) # True for each row that has an NA value in at least one column

In [None]:
df1.isna().any(axis=0) # True for each column that contains at least one NA value

In [None]:
df1.isna().any() # Default is axis=0: True for each column that contains at least one NA value

In [None]:
# Returns a frame without the rows that contain any NA value; the result is
# not assigned, so df1 itself is not modified.
df1.dropna()

In [None]:
# df1 still contains its rows with missing values: the dropna() result above was not kept.
df1.info()

In [None]:
# mode() returns a Series because several values can tie; [0] takes its first entry.
df1.Calories.mode()[0]

In [None]:
# Median of the Calories column; missing values are skipped by default.
df1.Calories.median()

In [None]:
# Load imdb_movies.csv with the default integer index.
movie_df=pd.read_csv('imdb_movies.csv')

In [None]:
# Column names, dtypes and non-null counts of the movie table.
movie_df.info()

In [None]:
# Peek at the first five movies.
movie_df.head()

In [None]:
# Returns a frame indexed by the "names" column; the result is not assigned,
# so movie_df keeps its integer index.
movie_df.set_index("names")

In [None]:
# Same file loaded into a second variable (movies_df, with an "s"), this time
# indexed by the "names" column.
movies_df = pd.read_csv("imdb_movies.csv", index_col="names")
# Shows date_x converted to datetime; the conversion is not stored back into the frame.
movies_df["date_x"].astype("datetime64[ns]")

In [None]:
# parse_dates asks read_csv to convert date_x to datetime while loading, so
# no separate astype step is needed afterwards.
movies_df = pd.read_csv("imdb_movies.csv", index_col="names",parse_dates=["date_x"])
movies_df.info()

In [None]:
# List the distinct country codes to find the value to filter on.
movies_df["country"].unique()

In [None]:
# Keep only the rows whose country code is "IN". The explicit .copy() makes
# this an independent frame, so the column assignments made to it in later
# cells do not trigger SettingWithCopyWarning.
indian_movie_df = movies_df[movies_df["country"]=="IN"].copy()

In [None]:
# Display the filtered subset (country == "IN").
indian_movie_df

In [None]:
# Note: this is movie_df (integer index, all countries), not movies_df.
movie_df.head()

In [None]:
# True for each column of the full table that contains at least one missing value.
movie_df.isna().any(axis=0)

In [None]:
# Number of rows that have at least one missing value (True counts as 1 in the sum).
movie_df.isna().any(axis=1).sum()

In [None]:
# Same column-wise check, restricted to the "IN" subset.
indian_movie_df.isna().any(axis=0)

In [None]:
# Row-wise check: True for each movie with at least one missing field.
indian_movie_df.isna().any(axis=1)

In [None]:
# Count of missing values per column.
indian_movie_df.isna().sum()

In [None]:
# Baseline row count and non-null counts before any rows or columns are dropped.
indian_movie_df.info()

In [None]:
# info() of what remains after dropping every row that has a missing value
# (the result is not stored).
indian_movie_df.dropna().info()

In [None]:
# axis=1 drops the columns that contain missing values instead of the rows
# (the result is not stored).
indian_movie_df.dropna(axis=1).info()

In [None]:
# indian_movie_df itself is unchanged by the dropna() previews above.
indian_movie_df.head()

In [None]:
# Summary statistics for the subset's numeric columns.
indian_movie_df.describe()

In [None]:
indian_movie_df['genre'].value_counts() # frequency of each distinct value in the column, most common first

In [None]:
# How many movies share each score value.
indian_movie_df['score'].value_counts()

In [None]:
# The .dt accessor exposes datetime parts: year, month, day, date, weekday,
# isocalendar().week, day_name(), month_name().
# assign() returns a new frame with the extra column instead of writing into
# indian_movie_df in place, which sidesteps SettingWithCopyWarning when that
# frame was produced by filtering another frame.
indian_movie_df = indian_movie_df.assign(year=indian_movie_df["date_x"].dt.year)

In [None]:
# Name of the weekday for each date_x value; displayed only, not stored.
indian_movie_df["date_x"].dt.day_name()

In [None]:
# Label each movie "good" when its score is above 70 and "bad" otherwise
# (a missing score compares False and therefore also ends up as "bad").
# assign() returns a new frame with the extra column instead of writing into
# indian_movie_df in place, which sidesteps SettingWithCopyWarning when that
# frame was produced by filtering another frame.
indian_movie_df = indian_movie_df.assign(
    rating=indian_movie_df["score"].apply(lambda s: "good" if s > 70 else "bad")
)

In [None]:
# Show the subset including the derived columns added above.
indian_movie_df