Initialization

In [25]:
import pandas as pd  # Import the pandas library for data manipulation

# Sample records with deliberately missing entries (None) in 'Age', 'City'
# and 'Salary', so the cells below have something to detect and impute.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[24, None, 22, None],
    City=['New York', 'Los Angeles', None, 'Chicago'],
    Salary=[70000, 80000, None, 60000],
)

# One row per person; column order follows the key order of `data`.
df = pd.DataFrame(data)

In [26]:

# Show, in order: the raw frame, the number of missing values per column,
# and the rows that survive dropna() (rows with no missing value at all).
# The empty string at the end emits the same trailing blank line as a
# separate print("") would.
print(df, df.isnull().sum(), df.dropna(), "", sep="\n")


      Name   Age         City   Salary
0    Alice  24.0     New York  70000.0
1      Bob   NaN  Los Angeles  80000.0
2  Charlie  22.0         None      NaN
3    David   NaN      Chicago  60000.0
Name      0
Age       2
City      1
Salary    1
dtype: int64
    Name   Age      City   Salary
0  Alice  24.0  New York  70000.0



Missing values 

In [27]:

# Mean imputation for the numeric columns: every missing 'Age' / 'Salary'
# entry is replaced by the mean of the values present in that same column.
# DataFrame.mean() skips NaN by default, and fillna() with a Series matches
# the fill value to each column by label. 'City' is left untouched.
df[['Age', 'Salary']] = df[['Age', 'Salary']].fillna(df[['Age', 'Salary']].mean())
print(df)

      Name   Age         City   Salary
0    Alice  24.0     New York  70000.0
1      Bob  23.0  Los Angeles  80000.0
2  Charlie  22.0         None  70000.0
3    David  23.0      Chicago  60000.0


Data encoding

In [49]:
import pandas as pd
# Load the dataset for the encoding examples; the path is relative, so
# 'data.csv' must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [50]:
## Encoding categorical values using label encoding and one-hot encoding
from sklearn.preprocessing import LabelEncoder
# Label encoding: learn the distinct labels in 'Passed', then map each row
# to its integer code and store the result in a new 'encoded-Passed' column.
le = LabelEncoder()
le.fit(df['Passed'])
df['encoded-Passed'] = le.transform(df['Passed'])

# Print the frame followed by one blank line.
print(df, "", sep="\n")

       Name  Gender     City Passed  encoded-Passed
0     Alice  Female  Chicago    Yes               1
1       Bob    Male  Chicago     No               0
2   Charlie    Male  Chicago    Yes               1
3     David    Male  Houston     No               0
4       Eve  Female  Houston    Yes               1
5     Frank    Male  Houston     No               0
6     Grace  Female  Houston    Yes               1
7    Hannah  Female   Dallas     No               0
8       Ian    Male   Dallas    Yes               1
9     Julia  Female   Austin     No               0
10    Kevin    Male   Austin    Yes               1
11    Laura  Female   Austin     No               0



In [53]:
# One-hot encode 'City': the column is replaced by one 'City_<value>'
# indicator column per distinct city.
print("One hot encoding for 'City' column")
one_hot_encoded_df = pd.get_dummies(data=df, prefix='City', columns=['City'])

# Recent pandas versions emit boolean indicators; cast every boolean column
# to int so the table shows 0/1 instead of False/True.
bool_cols = one_hot_encoded_df.select_dtypes(include=['bool']).columns
one_hot_encoded_df = one_hot_encoded_df.astype({name: int for name in bool_cols})
print(one_hot_encoded_df)

One hot encoding for 'City' column
       Name  Gender Passed  encoded-Passed  City_Austin  City_Chicago  \
0     Alice  Female    Yes               1            0             1   
1       Bob    Male     No               0            0             1   
2   Charlie    Male    Yes               1            0             1   
3     David    Male     No               0            0             0   
4       Eve  Female    Yes               1            0             0   
5     Frank    Male     No               0            0             0   
6     Grace  Female    Yes               1            0             0   
7    Hannah  Female     No               0            0             0   
8       Ian    Male    Yes               1            0             0   
9     Julia  Female     No               0            1             0   
10    Kevin    Male    Yes               1            1             0   
11    Laura  Female     No               0            1             0   

    City_Dallas

Feature scaling

In [57]:

# Load the dataset for the scaling examples (the cells below read its
# 'feature1' and 'feature2' columns); the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='scaling.csv')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardisation: learn the scaling statistics of the two feature columns,
# then apply them to the same columns.
scaler = StandardScaler().fit(df[['feature1', 'feature2']])
X_scaled = scaler.transform(df[['feature1', 'feature2']])

# transform() returns a plain array, so wrap it in a labelled frame for display.
df_standard_scaled = pd.DataFrame(
    X_scaled,
    columns=['feature1_scaled', 'feature2_scaled'],
)
print(df_standard_scaled)


   feature1_scaled  feature2_scaled
0        -0.620919        -1.666344
1         0.162483         0.341299
2        -1.404321        -0.863287
3         1.584213         1.545886
4        -0.969098        -0.461758
5         0.858841         0.742828
6        -0.359785        -1.264815
7         1.265049         1.144357
8        -1.143187        -0.060229
9         0.626722         0.542064


In [64]:
# Min-max scaling: learn each feature column's range, then rescale the same
# columns (the printed result runs from 0 to 1 per column).
scaler = MinMaxScaler().fit(df[['feature1', 'feature2']])
X_scaled = scaler.transform(df[['feature1', 'feature2']])

# transform() returns a plain array, so wrap it in a labelled frame for display.
df_minmax_scaled = pd.DataFrame(
    X_scaled,
    columns=['feature1_scaled', 'feature2_scaled'],
)
print(df_minmax_scaled)

   feature1_scaled  feature2_scaled
0         0.262136           0.0000
1         0.524272           0.6250
2         0.000000           0.2500
3         1.000000           1.0000
4         0.145631           0.3750
5         0.757282           0.7500
6         0.349515           0.1250
7         0.893204           0.8750
8         0.087379           0.5000
9         0.679612           0.6875


Splitting data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Toy dataset: ten observations of hours studied and the resulting test score.
data = dict(
    study_hours=list(range(2, 12)),       # 2, 3, ..., 11
    Test_score=list(range(50, 100, 5)),   # 50, 55, ..., 95
)
df = pd.DataFrame(data)

# Standardise both columns of the full frame.
# NOTE(review): `Scaled` is not used anywhere else in this cell; the split
# below works on the unscaled `df`. Confirm whether a later cell needs it.
Standard_Scaler = StandardScaler()
Scaled = Standard_Scaler.fit(df).transform(df)

# Feature matrix and target, both kept as single-column DataFrames.
X, y = df[['study_hours']], df[['Test_score']]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each heading is followed by its frame, one item per line.
print(
    "Feature Training data", X_train,
    "Feature Testing data", X_test,
    "Target Training data", y_train,
    "Target Testing data", y_test,
    sep="\n",
)
# X_train / y_train are for fitting a model; X_test / y_test are for
# evaluating it on rows it has not seen.

Feature Training data
   study_hours
5            7
0            2
7            9
2            4
9           11
4            6
3            5
6            8
Feature Testing data
   study_hours
8           10
1            3
Target Training data
   Test_score
5          75
0          50
7          85
2          60
9          95
4          70
3          65
6          80
Target Testing data
   Test_score
8          90
1          55


Predicted values
[[90.]
 [55.]]
