In [None]:
import pandas as pd

In [None]:
# Build a small example DataFrame from a {column name: list of values} mapping
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[25, 30, 35, 40],
    Salary=[50000, 60000, 70000, 80000],
)
df = pd.DataFrame(data=data)  # each key becomes a column, each list its rows
print("DataFrame:\n", df)

DataFrame:
       Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000
3    David   40   80000


In [None]:
## Upload csv file
#  Read & Write
# Load the CSV into `df`; this rebinds `df`, replacing whatever it held before.
source_csv = "california_housing_test.csv"
df = pd.read_csv(source_csv)

# Optional: write the frame back out to a new file.
# index=False keeps the row index out of the written CSV.
output_csv = "output.csv"
df.to_csv(output_csv, index=False)


In [None]:
# ---- Basic Info ----
# Structural overview of `df`, printed one labelled section at a time.
print("--- Basic Info ---")

first_rows = df.head(2)   # two rows from the top
last_rows = df.tail(2)    # two rows from the bottom
print("Head:\n", first_rows)
print("\nTail:\n", last_rows)

n_rows_cols = df.shape                # (row count, column count)
column_names = df.columns.tolist()    # column labels as a plain Python list
print("\nShape:", n_rows_cols)
print("\nColumns:", column_names)

print("\n--- Info ---")
df.info()   # called bare: info() emits its own summary, there is nothing to print()

print("\n--- Description ---")
summary_stats = df.describe()   # statistical summary of the frame
print(summary_stats)

--- Basic Info ---
Head:
    longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  

Tail:
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
2998    -117.12     34.10                40.0         96.0            14.0   
2999    -119.63     34.42                42.0       1765.0           263.0   

      population  households  median_income  median_house_value  
2998        46.0        14.0         3.2708            162500.0  
2999       753.0       260.0         8.5608            500001.0  

Shape: (3000, 9)

Columns: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'populati

In [None]:
# ---- Selection & Indexing ----
print("\n--- Selection & Indexing ---")

# One column selected by name
rooms = df["total_rooms"]
print("\nSingle Column (total_rooms):\n", rooms.head())

# Several columns selected with a list of names
wanted_columns = ["housing_median_age", "median_house_value"]
print("\nMultiple Columns:\n", df[wanted_columns].head())

# Positional access: iloc[1] is the second row regardless of index labels
second_row = df.iloc[1]
print("\nRow by Index (iloc[1]):\n", second_row)

# Label-based access: loc[2] is the row whose index label equals 2
row_labelled_2 = df.loc[2]
print("\nRow by Label (loc[2]):\n", row_labelled_2)

# A single cell addressed by (row label, column name)
cell_value = df.loc[2, "median_house_value"]
print("\nCell [row=2, col='median_house_value']:", cell_value)



--- Selection & Indexing ---

Single Column (total_rooms):
 0    3885.0
1    1510.0
2    3589.0
3      67.0
4    1241.0
Name: total_rooms, dtype: float64

Multiple Columns:
    housing_median_age  median_house_value
0                45.0            344700.0
1                43.0            176500.0
2                27.0            270500.0
3                28.0            330000.0
4                19.0             81700.0

Row by Index (iloc[1]):
 longitude               -118.300
latitude                  34.260
housing_median_age        43.000
total_rooms             1510.000
total_bedrooms           310.000
population               809.000
households               277.000
median_income              3.599
median_house_value    176500.000
Name: 1, dtype: float64

Row by Label (loc[2]):
 longitude               -117.8100
latitude                  33.7800
housing_median_age        27.0000
total_rooms             3589.0000
total_bedrooms           507.0000
population              1484.00

In [None]:
#  Filtering
# Boolean mask: rows whose housing_median_age is strictly greater than 50
older_than_50 = df["housing_median_age"] > 50
print("\nHouses with median_age > 50:\n", df.loc[older_than_50].head())

# Boolean mask: median_house_value inside [150000, 200000], both ends included
value_filter = df["median_house_value"].between(150000, 200000)
print("\nHouses with value between 150k & 200k:\n", df.loc[value_filter].head())



Houses with median_age > 50:
      longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
26     -122.42     37.76                52.0       3587.0          1030.0   
36     -122.53     37.97                52.0       1560.0           451.0   
70     -121.92     37.33                52.0       2125.0           382.0   
99     -118.20     33.77                52.0       1375.0           457.0   
109    -122.43     37.74                52.0       1514.0           314.0   

     population  households  median_income  median_house_value  
26       2259.0       979.0         2.5403            250000.0  
36        700.0       419.0         2.5125            270800.0  
70        930.0       387.0         5.2831            299500.0  
99       1089.0       317.0         2.2344            200000.0  
109       724.0       301.0         5.3292            300900.0  

Houses with value between 150k & 200k:
     longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \

In [None]:
#  Modifying
# Overwrite a single cell in place: set 'housing_median_age' to 45 for the row
# whose index label is 0.
# NOTE: this mutates `df` itself, so any cell executed after this one sees 45.0
# for that row instead of the value that was loaded from the CSV.
df.loc[0, "housing_median_age"] = 45
# Message corrected: this cell changes one value and does not add a column.
print("\nModified DataFrame (changed value):\n", df.head())


Modified DataFrame (new column and changed value):
    longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                45.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359            330000.0  
4       850.0       237.0         2.9375             81700.0  


In [None]:
#  Sorting
# Smallest population first (sort_values sorts ascending unless told otherwise)
by_population = df.sort_values("population")
print("\nSort by Population:\n", by_population.head())

# Highest median house value first
by_value_desc = df.sort_values("median_house_value", ascending=False)
print("\nSort by Median House Value (Descending):\n", by_value_desc.head())



Sort by Population:
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
2640    -114.62     33.62                26.0         18.0             3.0   
740     -117.12     32.66                52.0         16.0             4.0   
1115    -116.95     33.86                 1.0          6.0             2.0   
1355    -117.11     32.66                52.0         25.0             5.0   
2494    -118.44     34.04                49.0         32.0             7.0   

      population  households  median_income  median_house_value  
2640         5.0         3.0         0.5360            275000.0  
740          8.0         3.0         1.1250             60000.0  
1115         8.0         2.0         1.6250             55000.0  
1355        14.0         9.0         1.6250            118800.0  
2494        14.0         7.0         2.1875            225000.0  

Sort by Median House Value (Descending):
       longitude  latitude  housing_median_age  total_rooms  total_bedr

In [None]:
#  GroupBy & Aggregation
# 'total_rooms' is a continuous measurement, not a category. Grouping on its raw
# values produced 2215 groups for 3000 rows, so each "average" was mostly a
# single observation. Bin it into quartiles first so every group holds many rows.
# duplicates="drop" merges bins if two quantile edges happen to coincide.
rooms_quartile = pd.qcut(df["total_rooms"], q=4, duplicates="drop")

# Mean house value within each total_rooms quartile.
# observed=True reports only the bins that actually contain rows.
grouped = df.groupby(rooms_quartile, observed=True)["median_house_value"].mean()
print("\nAverage house value by total rooms:\n", grouped)

# Median population and mean house value for each 'total_rooms' category (quartile)
agg_funcs = {
    'population': 'median',
    'median_house_value': 'mean'
}
grouped_agg = df.groupby(rooms_quartile, observed=True).agg(agg_funcs)
print("\nAggregate\n", grouped_agg)



Average house value by total rooms:
 total_rooms
6.0         55000.0
16.0        60000.0
18.0       275000.0
19.0       162500.0
21.0       175000.0
             ...   
21988.0    191100.0
23915.0    244900.0
24121.0    239300.0
27870.0    212200.0
30450.0    174300.0
Name: median_house_value, Length: 2215, dtype: float64

Aggregate
              population  median_house_value
total_rooms                                
6.0                 8.0             55000.0
16.0                8.0             60000.0
18.0                5.0            275000.0
19.0              166.0            162500.0
21.0               21.0            175000.0
...                 ...                 ...
21988.0          8824.0            191100.0
23915.0         10877.0            244900.0
24121.0          4176.0            239300.0
27870.0         11935.0            212200.0
30450.0          9419.0            174300.0

[2215 rows x 2 columns]


In [None]:
#  Handling Missing Values
#  Left untreated, missing values can cause computational errors, biased results,
#  and poor model performance.
df2 = pd.DataFrame(
    data=dict(
        Name=["Eve", "Frank", None],
        Age=[28, None, 40],
        Salary=[55000, 62000, None],
    )
)
print("Original DataFrame with Missing Values:\n", df2)

# Strategy 1: discard every row that has at least one missing entry
complete_rows = df2.dropna()
print("\nDrop NaN:\n", complete_rows)

# Strategy 2: fill the gaps column by column -
# median for Age, mean for Salary, a placeholder string for Name
age_default = df2["Age"].median()
salary_default = df2["Salary"].mean()
fill_values = {"Age": age_default, "Salary": salary_default, "Name": "Unknown"}
filled = df2.fillna(value=fill_values)
print("\nFill NaN:\n", filled)


Original DataFrame with Missing Values:
     Name   Age   Salary
0    Eve  28.0  55000.0
1  Frank   NaN  62000.0
2   None  40.0      NaN

Drop NaN:
   Name   Age   Salary
0  Eve  28.0  55000.0

Fill NaN:
       Name   Age   Salary
0      Eve  28.0  55000.0
1    Frank  34.0  62000.0
2  Unknown  40.0  58500.0


In [None]:
#  Merging & Joining
# Two small tables that share an "ID" key column
df_a = pd.DataFrame(dict(ID=[1, 2, 3], Name=["Alice", "Bob", "Charlie"]))
df_b = pd.DataFrame(dict(ID=[1, 2, 3], Dept=["HR", "IT", "Finance"]))

# Combine the rows of both tables whose "ID" values match
merged = pd.merge(left=df_a, right=df_b, on="ID")
print("Merged DataFrame:\n", merged)


Merged DataFrame:
    ID     Name     Dept
0   1    Alice       HR
1   2      Bob       IT
2   3  Charlie  Finance
