### Getting familiar with Pandas

In [None]:
import pandas as pd

In [None]:
# Sample data: one list of four values per fruit.
data = dict(
    apples=[300, 200, 0, 100],
    oranges=[0, 300, 700, 200],
)
data

In [None]:
# Build a DataFrame from the dict: each key becomes a column, and the rows
# get the default 0..n-1 integer index.
df = pd.DataFrame(data)
df

In [None]:
# Same data, this time with explicit quarter labels as the row index.
df = pd.DataFrame(data, index = ["Q1", "Q2", "Q3", "Q4"])
df

In [None]:
# Print a summary of the frame: index, columns, non-null counts and dtypes.
df.info()

In [None]:
# reset_index() returns a new frame: the old row labels move into a column
# named "index" and the rows get a fresh 0..n-1 index. df is not modified.
df1 = df.reset_index()
df1

In [None]:
# Promote the "index" column back to being the row index.
df1 = df1.set_index("index")
df1

In [None]:
# Replace the row labels directly; the list must have one label per row.
df1.index = ["q1", "q2", "q3", "q4"]
df1

In [None]:
# Label-based slicing with .loc: both endpoints ("q1" and "q3") are included.
df1.loc["q1":"q3"]

In [None]:
# Position-based slicing with .iloc: rows 0, 1 and 2 (the end position is excluded).
df1.iloc[0:3]

### Data Cleaning

In [None]:
# Deliberately messy sample data: 'col1' and 'col2' each contain one string
# among numeric values, so both columns are loaded with the object dtype.
data = {
    'col1': [1, 2, 3, 'data 3'],
    'col2': [1.1, 2.2, 'data -3.2', 4.4],
    'col3': ['a', 'b', 'c', 'd'],
}
# Build the frame from the dict above instead of repeating the literal.
df = pd.DataFrame(data)

In [None]:
# astype(..., errors='ignore') returns the original Series when a value
# cannot be converted ('data 3' here), so nothing changes.
# errors options: {'raise', 'ignore'}, default 'raise'
print(df['col1'].astype(int, errors='ignore'))

# to_numeric(..., errors='coerce') turns unparseable values into NaN.
# errors options: {'ignore', 'raise', 'coerce'}, default 'raise'
print(pd.to_numeric(df['col2'], errors='coerce'))

# With errors='raise' the failed conversion is reported as an exception.
# Catch only the conversion errors so unrelated problems (typos,
# KeyboardInterrupt, ...) are not silently swallowed.
try:
    df['col1'] = df['col1'].astype(float, errors='raise')
except (ValueError, TypeError):
    print('Exception raised by astype')


# Show the column dtypes before and after converting 'col3' to the
# dedicated pandas string dtype.
print(df.dtypes)
df["col3"] = df['col3'].astype("string")
print(df.dtypes)
#print(df.convert_dtypes().dtypes)

In [None]:
# Work on a copy so the original frame stays untouched.
df1 = df.copy()
print(df1)
# The .str accessor needs string data, so convert every column first.
df1 = df1.astype("string")
# Pull the first signed integer/decimal out of each cell. The pattern is a
# raw string so "\d" and "\." reach the regex engine instead of being
# treated as (invalid) string escapes. expand=False returns a Series, which
# is what a single-column assignment expects; cells without a number become NA.
df1["col1"] = df1["col1"].str.extract(r"([-+]?\d*\.?\d+)", expand=False)
df1["col2"] = df1["col2"].str.extract(r"([-+]?\d*\.?\d+)", expand=False)
df1

In [None]:
import re  # regular expression library

# Compiled once; raw string so "\d" and "\." are regex escapes rather than
# (invalid) string escapes.
_NUMBER_RE = re.compile(r"([-+]?\d*\.?\d+)")


def extractAfloat(s):
    """Return the first signed integer/decimal found in *s* as a string.

    Returns pd.NA when *s* contains no number, matching what
    Series.str.extract produces for a non-matching cell, instead of
    failing with IndexError.
    """
    match = _NUMBER_RE.search(s)
    return match.group(1) if match else pd.NA


df1 = df.copy()
print(df1)
# Convert every column to strings so the regex can be applied to each cell.
df1 = df1.astype("string")
df1["col1"] = df1["col1"].apply(extractAfloat)
df1["col2"] = df1["col2"].apply(extractAfloat)
df1

In [None]:
# Load the CSV; the relative path is resolved against the current working directory.
df = pd.read_csv("data.csv")

In [None]:
# Preview the first five rows (the default for head()).
df.head()

In [None]:
# Column names, non-null counts and dtypes of the loaded data.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# sort_values returns a new frame sorted ascending by "Duration"; df is left
# unchanged and each row keeps its original index label.
df1 = df.sort_values("Duration")
df1

In [None]:
# First five rows of the sorted frame.
df1.head()

In [None]:
# Returns a copy with a fresh 0..n-1 index; the old labels are kept in a new
# "index" column. The result is only displayed, df1 is not modified.
df1.reset_index()

In [None]:
# drop=True discards the old labels instead of keeping them as a column.
# Still returns a copy; df1 is not modified.
df1.reset_index(drop=True)

In [None]:
# Display df1 as it currently stands.
df1

In [None]:
# Renumber the rows 0..n-1, discarding the old labels, and keep the result.
df1 = df1.reset_index(drop=True)
df1

In [None]:
# Indexing with a list of column names returns a DataFrame with just those columns.
df1[["Duration", "Pulse"]]

In [None]:
# Boolean-mask filter: keep only the rows whose Duration equals 20.
df1[df1["Duration"] == 20]

In [None]:
# Display df1; the filter above returned a new frame and did not change it.
df1

In [None]:
# Without drop=True the current row labels are inserted as an extra column
# ("index"; pandas uses "level_0" if a column called "index" already exists).
df1 = df1.reset_index()
df1

In [None]:
# Remove the helper columns left behind by reset_index(). "level_0" only
# exists when the reset cell was executed more than once, so labels that are
# missing are ignored instead of raising KeyError.
df1 = df1.drop(columns=["level_0", "index"], errors="ignore")

In [None]:
# Preview the first five rows.
df1.head()

In [None]:
# Rename the "Duration" column to "Seconds" and keep the renamed frame.
df1 = df1.rename(columns={"Duration": "Seconds"})

In [None]:
# Preview the first five rows.
df1.head()

In [None]:
# Element-wise comparison: yields a boolean Series, True where Pulse exceeds 100.
df1["Pulse"]>100

In [None]:
# Store the boolean mask as a new "AtRisk" column (True where Pulse > 100).
df1["AtRisk"] = df1["Pulse"]>100
df1.head()

In [None]:
# Use the boolean mask to keep only the rows where Pulse exceeds 100.
df1[df1["Pulse"]>100]

In [None]:
# Distinct values of the "Seconds" column, in order of first appearance.
df1["Seconds"].unique()

In [None]:
# Nested-dict form of replace: within column "Seconds", map 15 -> "Short"
# and 300 -> "Long". replace() returns a new frame; the result is only
# displayed here, so df1 itself is unchanged.
df1.replace({
    "Seconds":{
        15: "Short",
        300: "Long"
    }
})

In [None]:
# Preview the first five rows.
df1.head()

In [None]:
# Non-null counts per column reveal which columns contain missing values.
df1.info()

In [None]:
# Boolean frame of the same shape: True wherever a cell is NA.
df1.isna()

In [None]:
# axis=1 reduces across the columns: True for each row that has at least one NA value.
df1.isna().any(axis=1)

In [None]:
# axis=0 reduces down the rows: True for each column that has at least one NA value.
df1.isna().any(axis=0)

In [None]:
# any() defaults to axis=0: True for each column that has at least one NA value.
df1.isna().any()

In [None]:
# Returns a copy without the rows that contain any NA value. The result is
# only displayed, so df1 itself still holds those rows.
df1.dropna()

In [None]:
# df1 is unchanged by the unassigned dropna() call, so the counts are as before.
df1.info()

In [None]:
# mode() always returns a Series because several values can tie; [0] takes the first one.
df1.Calories.mode()[0]

In [None]:
# Median of the Calories column; NA values are skipped by default.
df1.Calories.median()

In [None]:
# Load the movie data set with the default 0..n-1 row index.
movie_df=pd.read_csv('imdb_movies.csv')

In [None]:
# Column names, non-null counts and dtypes of the movie data.
movie_df.info()

In [None]:
# Preview the first five rows.
movie_df.head()

In [None]:
# Returns a copy indexed by the "names" column. The result is only
# displayed; movie_df keeps its integer index.
movie_df.set_index("names")

In [None]:
# Re-read the file, this time using the "names" column as the row index.
movies_df = pd.read_csv("imdb_movies.csv", index_col="names")
# Displays "date_x" converted to datetime64[ns]. The result is not assigned,
# so the column stored in movies_df keeps its original dtype.
movies_df["date_x"].astype("datetime64[ns]")

In [None]:
# parse_dates asks read_csv to convert "date_x" to datetime while loading,
# so no separate conversion step is needed; info() shows the resulting dtypes.
movies_df = pd.read_csv("imdb_movies.csv", index_col="names",parse_dates=["date_x"])
movies_df.info()

In [None]:
# Distinct values found in the "country" column.
movies_df["country"].unique()

In [None]:
# Keep only the rows whose country code is "IN". The explicit .copy() makes
# the result an independent DataFrame, so assigning new columns to it later
# does not trigger SettingWithCopyWarning or depend on view-vs-copy behaviour.
indian_movie_df = movies_df[movies_df["country"] == "IN"].copy()

In [None]:
# Display the filtered frame.
indian_movie_df

In [None]:
# Note: this is movie_df (integer index), not movies_df (indexed by "names").
movie_df.head()

In [None]:
# True for each column that has at least one NA value.
movie_df.isna().any(axis=0)

In [None]:
# Number of rows that have at least one NA value (True counts as 1 in sum()).
movie_df.isna().any(axis=1).sum()

In [None]:
# True for each column of the filtered frame that has at least one NA value.
indian_movie_df.isna().any(axis=0)

In [None]:
# True for each row of the filtered frame that has at least one NA value.
indian_movie_df.isna().any(axis=1)

In [None]:
# Number of NA values in each column.
indian_movie_df.isna().sum()

In [None]:
# Row count, non-null counts and dtypes of the filtered frame.
indian_movie_df.info()

In [None]:
# Summary of a temporary copy with every row containing an NA removed;
# indian_movie_df itself is not modified.
indian_movie_df.dropna().info()

In [None]:
# axis=1 drops columns instead: every column containing an NA is removed
# from the temporary copy, and all rows are kept.
indian_movie_df.dropna(axis=1).info()

In [None]:
# Preview the first five rows.
indian_movie_df.head()

In [None]:
# Summary statistics of the filtered frame.
indian_movie_df.describe()

In [None]:
# value_counts() returns how often each distinct value occurs in the column,
# most frequent first.
indian_movie_df['genre'].value_counts()

In [None]:
# Frequency of each distinct score, most frequent first.
indian_movie_df['score'].value_counts()

In [None]:
# The .dt accessor requires "date_x" to be a datetime column. Other accessors
# include month, day, date, weekday, day_name() and month_name(); for the
# week number use .dt.isocalendar().week (.dt.week is gone in pandas 2.x).
indian_movie_df["year"] = indian_movie_df["date_x"].dt.year

In [None]:
# Name of the weekday for each date in "date_x".
indian_movie_df["date_x"].dt.day_name()

In [None]:
# Label each movie: "good" when the score is strictly above 70, otherwise
# "bad" (this includes a score of exactly 70 and missing scores).
indian_movie_df["rating"] = indian_movie_df["score"].apply(lambda s : "good" if s>70 else "bad")

In [None]:
# Display the frame, now including the added "year" and "rating" columns.
indian_movie_df