#Practise

##Dataset for practise

In [2]:
import pandas as pd
import numpy as np

np.random.seed(123)

# Generate numerical columns with NaN values
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')
num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

# Generate messy categorical/text columns
cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

# Combine into DataFrame and add untidiness
df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Add untidy issues:
df_untidy.loc[df_untidy.sample(frac=0.15, random_state=1).index, 'Height_cm'] = \
    df_untidy['Height_cm'].dropna().astype(str) + 'cm'   # Mix data type in Height_cm

df_untidy.loc[df_untidy.sample(frac=0.15, random_state=2).index, 'Rating'] = \
    'Rating: ' + df_untidy['Rating'].dropna().astype(str) # Prefix string for some ratings

df_untidy.head()


 '47.7445897977863cm' '12.199264419087633cm' '28.255294989080216cm'
 '49.46319556401813cm' '24.62082473109778cm' '14.069842039264948cm'
 '30.047591231707173cm' '39.596908878071915cm' '47.903273405497885cm'
 '18.383678364591226cm' '20.573119172482315cm' '26.918929348570554cm'
 '49.60956000856316cm' '42.872544601068384cm' '47.28354972175643cm' nan
 nan '36.95711914375434cm' nan '25.124837252276944cm'
 '41.69467488795425cm' '28.67398840853555cm' '31.546329651552767cm'
 '14.629781437963878cm' '10.509502547356245cm' '18.213990989134608cm'
 '24.690622519316705cm' '23.881006122675878cm' '16.30134384410546cm' nan
 '29.66207244015815cm' nan '45.91010976816148cm' nan
 '31.599083317303908cm' '25.60501244185592cm' '43.06120649429532cm' nan
 '12.847775069640711cm' '12.664116985961948cm' '47.658679896221cm'
 '33.2449203552472cm' '39.08764020242252cm' '28.33330050691903cm'
 '22.962576174780793cm' '27.894375469792145cm' '32.038131009330215cm'
 '14.575393291450958cm' nan nan '11.712756967808463cm' nan


Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,Rating: 5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,3.0,grape,D,
4,-0.5786,31.599083317303908cm,,banana,C,No


Q1. Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).

In [4]:
import pandas as pd

df = pd.DataFrame({
    "Age": [25, None, 30, 22],
    "City": ["Delhi", "Mumbai", None, "Chennai"]
})

print(df.isnull().sum())
df["Age"].fillna(df["Age"].mean(), inplace=True)

df["City"].fillna(df["City"].mode()[0], inplace=True)

print(df)

Age     1
City    1
dtype: int64
         Age     City
0  25.000000    Delhi
1  25.666667   Mumbai
2  30.000000  Chennai
3  22.000000  Chennai


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["City"].fillna(df["City"].mode()[0], inplace=True)


 Q2.  Identify columns with non-numeric (categorical) data and convert them into a numeric format using encoding techniques such as one-hot encoding or label encoding.

In [5]:
df["City_Label"] = df["City"].astype("category").cat.codes

df_encoded = pd.get_dummies(df, columns=["City"])
print(df_encoded)

         Age  City_Label  City_Chennai  City_Delhi  City_Mumbai
0  25.000000           1         False        True        False
1  25.666667           2         False       False         True
2  30.000000           0          True       False        False
3  22.000000           0          True       False        False


Q3. Detect any columns in the DataFrame that contain mixed data types (such as numbers stored as strings or strings with prefixes). Write code to clean and convert these columns to appropriate, consistent types.

In [6]:
df["Salary"] = ["1000", "2000", "3k", "4000"]

df["Salary"] = df["Salary"].str.replace(r"\D", "", regex=True).astype(float)
print(df)

         Age     City  City_Label  Salary
0  25.000000    Delhi           1  1000.0
1  25.666667   Mumbai           2  2000.0
2  30.000000  Chennai           0     3.0
3  22.000000  Chennai           0  4000.0


 Q4. Apply scaling and/or normalization techniques (such as Min-Max Scaling and Standardization) to the numerical columns to prepare them for downstream machine learning tasks.

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler()
df[["Age","Salary"]] = scaler.fit_transform(df[["Age","Salary"]])

scaler2 = StandardScaler()
df[["Age","Salary"]] = scaler2.fit_transform(df[["Age","Salary"]])
print(df)

            Age     City  City_Label    Salary
0 -2.332847e-01    Delhi           1 -0.507904
1  3.107977e-16   Mumbai           2  0.168625
2  1.516351e+00  Chennai           0 -1.182404
3 -1.283066e+00  Chennai           0  1.521683


Q5. Write a function to check for and report any remaining inconsistencies (missing values, mixed types, out-of-range values) in the cleaned DataFrame. Validate that the preprocessing steps have successfully prepared the data for analysis.

In [8]:
def check(data):
    return {
        "missing_values": data.isnull().sum().to_dict(),
        "dtypes": data.dtypes.to_dict(),
        "out_of_range": {col: ((data[col]<0).sum()) for col in data.select_dtypes(include="number")}
    }

print(check(df))

{'missing_values': {'Age': 0, 'City': 0, 'City_Label': 0, 'Salary': 0}, 'dtypes': {'Age': dtype('float64'), 'City': dtype('O'), 'City_Label': dtype('int8'), 'Salary': dtype('float64')}, 'out_of_range': {'Age': np.int64(2), 'City_Label': np.int64(0), 'Salary': np.int64(2)}}
