In [10]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# -------------------------------
# 1. Load a Dataset Using NumPy
# -------------------------------
# Provide the correct path to the CSV file in Google Drive
file_path = '/content/drive/My Drive/data.csv'

# Load a CSV file. Adjust the path as needed.
df =pd.read_csv(file_path)

# Explore DataFrame attributes
print("Shape:", df.shape)
print("Columns:", df.columns)
print("Data Types:\n", df.dtypes)
print("First 5 rows:\n", df.head())
print("Last 5 rows:\n", df.tail())
print("Info:\n", df.info())
print("Describe:\n", df.describe())

# Selecting columns
name_column = df['Car']
occupation_salary_columns = df[['Model', 'Volume']]

# Selecting rows
first_row = df.iloc[0]
first_five_rows = df[0:5]
rows_label_0_to_4 = df.loc[0:4]
rows_iloc_0_to_4 = df.iloc[0:5]

print("\nSelected Columns:")
print(name_column)
print(occupation_salary_columns)

print("\nSelected Rows:")
print("First row:\n", first_row)
print("First five rows:\n", first_five_rows)
print("Rows 0 to 4 (loc):\n", rows_label_0_to_4)
print("Rows 0 to 4 (iloc):\n", rows_iloc_0_to_4)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape: (36, 5)
Columns: Index(['Car', 'Model', 'Volume', 'Weight', 'CO2'], dtype='object')
Data Types:
 Car       object
Model     object
Volume     int64
Weight     int64
CO2        int64
dtype: object
First 5 rows:
           Car       Model  Volume  Weight  CO2
0      Toyoty        Aygo    1000     790   99
1  Mitsubishi  Space Star    1200    1160   95
2       Skoda      Citigo    1000     929   95
3        Fiat         500     900     865   90
4        Mini      Cooper    1500    1140  105
Last 5 rows:
          Car   Model  Volume  Weight  CO2
31     Volvo    XC70    2000    1746  117
32      Ford   B-Max    1600    1235  104
33       BMW     216    1600    1390  108
34      Opel  Zafira    1600    1405  109
35  Mercedes     SLK    2500    1395  120
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 5 columns):
 # 

In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# -------------------------------
# 1. Load a Dataset Using NumPy
# -------------------------------
# Provide the correct path to the CSV file in Google Drive
file_path = '/content/drive/My Drive/data.csv'

# Load a CSV file. Adjust the path as needed.
df =pd.read_csv(file_path)
#  Data Cleaning and Preparation

# 1. Identify missing values using isnull() and isna()
print("Missing values using isnull():\n", df.isnull().sum())
print("Missing values using isna():\n", df.isna().sum())

# 2. Handle missing values

# Fill missing values with a constant (e.g., 0)
df_filled_constant = df.fillna(0)

# Fill missing values with the mean of the column (replace 'column_name' with actual column)
# Ensure the column you want to fill with mean is numerical
if 'column_name' in df.columns:  # Replace 'column_name' with the actual column
    df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

# Drop rows with any missing values
df_dropped = df.dropna()

# Interpolate missing values (linear interpolation for numerical data)
df_interpolated = df.interpolate()

# 3. Scaling numerical columns (Min-Max and Z-score scaling)

# Selecting numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Apply Min-Max scaling (scales data to range [0, 1])
scaler_min_max = MinMaxScaler()
df_min_max_scaled = df.copy()
df_min_max_scaled[numerical_cols] = scaler_min_max.fit_transform(df[numerical_cols])
print("Data after Min-Max Scaling:\n", df_min_max_scaled.head())

# Apply Z-score scaling (Standardization: mean=0, std=1)
scaler_zscore = StandardScaler()
df_zscore_scaled = df.copy()
df_zscore_scaled[numerical_cols] = scaler_zscore.fit_transform(df[numerical_cols])
print("Data after Z-score Scaling:\n", df_zscore_scaled.head())

# 4. Create dummy variables for categorical columns

# Identifying categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Creating dummy variables (one-hot encoding)
df_with_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print("Data after creating dummy variables:\n", df_with_dummies.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Missing values using isnull():
 Car       0
Model     0
Volume    0
Weight    0
CO2       0
dtype: int64
Missing values using isna():
 Car       0
Model     0
Volume    0
Weight    0
CO2       0
dtype: int64
Data after Min-Max Scaling:
           Car       Model  Volume    Weight       CO2
0      Toyoty        Aygo  0.0625  0.000000  0.300000
1  Mitsubishi  Space Star  0.1875  0.387029  0.166667
2       Skoda      Citigo  0.0625  0.145397  0.166667
3        Fiat         500  0.0000  0.078452  0.000000
4        Mini      Cooper  0.3750  0.366109  0.500000
Data after Z-score Scaling:
           Car       Model    Volume    Weight       CO2
0      Toyoty        Aygo -1.593366 -2.103893 -0.411925
1  Mitsubishi  Space Star -1.071901 -0.554072 -0.956120
2       Skoda      Citigo -1.593366 -1.521663 -0.956120
3        Fiat         500 -1.854099 -1.789740 -1.636364
4

  df_interpolated = df.interpolate()


In [12]:
import pandas as pd

# Sample data to work with
data = {
    'Category': ['A', 'B', 'A', 'C', 'B', 'C', 'A', 'C'],
    'SubCategory': ['X', 'Y', 'X', 'Z', 'Y', 'Z', 'X', 'Z'],
    'Sales': [200, 150, 300, 400, 500, 250, 100, 300],
    'Profit': [20, 30, 50, 60, 70, 10, 20, 15],
    'Quantity': [1, 2, 3, 4, 5, 2, 1, 2]
}
df = pd.DataFrame(data)

# Aggregation and Grouping

# 1. Calculate summary statistics using groupby

# Group by 'Category' and calculate mean, median, count
grouped = df.groupby('Category').agg({
    'Sales': ['mean', 'median', 'count'],
    'Profit': ['sum', 'mean'],
    'Quantity': 'sum'
})

print("Grouped Summary Statistics:\n", grouped)

# 2. Create pivot tables for data summarization
# Create a pivot table summarizing 'Sales' and 'Profit' by 'Category' and 'SubCategory'
pivot_table = pd.pivot_table(df, values=['Sales', 'Profit'],
                             index=['Category'],
                             columns=['SubCategory'],
                             aggfunc='sum',
                             fill_value=0)

print("\nPivot Table:\n", pivot_table)

# 3. Combine DataFrames using concat, merge, and join

# Create a second DataFrame to demonstrate merging
data2 = {
    'Category': ['A', 'B', 'C', 'D'],
    'Region': ['North', 'South', 'East', 'West']
}
df2 = pd.DataFrame(data2)

# Concatenation (vertically stacking two DataFrames)
df_concat = pd.concat([df, df2], axis=0, ignore_index=True)
print("\nConcatenated DataFrame:\n", df_concat)

# Merge DataFrames using different join types

# Inner join
df_inner = pd.merge(df, df2, on='Category', how='inner')
print("\nInner Join:\n", df_inner)

# Outer join
df_outer = pd.merge(df, df2, on='Category', how='outer')
print("\nOuter Join:\n", df_outer)

# Left join
df_left = pd.merge(df, df2, on='Category', how='left')
print("\nLeft Join:\n", df_left)

# Right join
df_right = pd.merge(df, df2, on='Category', how='right')
print("\nRight Join:\n", df_right)

# Joining using 'join' function (joining on indices)
df_joined = df.set_index('Category').join(df2.set_index('Category'), how='left')
print("\nJoin on Indices:\n", df_joined)


Grouped Summary Statistics:
                Sales              Profit            Quantity
                mean median count    sum       mean      sum
Category                                                    
A         200.000000  200.0     3     90  30.000000        5
B         325.000000  325.0     2    100  50.000000        7
C         316.666667  300.0     3     85  28.333333        8

Pivot Table:
             Profit          Sales          
SubCategory      X    Y   Z     X    Y    Z
Category                                   
A               90    0   0   600    0    0
B                0  100   0     0  650    0
C                0    0  85     0    0  950

Concatenated DataFrame:
    Category SubCategory  Sales  Profit  Quantity Region
0         A           X  200.0    20.0       1.0    NaN
1         B           Y  150.0    30.0       2.0    NaN
2         A           X  300.0    50.0       3.0    NaN
3         C           Z  400.0    60.0       4.0    NaN
4         B         