In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(df):
    """Impute, split and standardise a DataFrame that has a 'target' column.

    Steps:
      1. Replace missing values with the median of their column.
      2. Separate the 'target' column from the remaining (feature) columns.
      3. Split the rows into train and test sets (test_size=0.2,
         random_state=42).
      4. Standardise the features with a StandardScaler that is fitted on
         the training rows only and then applied to both splits.

    The caller's DataFrame is NOT modified: imputation produces a new frame
    instead of filling in place, so the raw data stays available and calling
    this function again on the same input gives the same result.

    Args:
        df: pandas DataFrame holding the feature columns plus a column
            named 'target'.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``: the
        scaler's output for the train and test features, and the matching
        slices of the 'target' column.

    Raises:
        KeyError: if ``df`` has no 'target' column.
    """
    # 1. Handle missing values by filling with median.
    #    fillna() without inplace=True returns a new frame, leaving `df` as
    #    the caller passed it.
    #    NOTE: the medians are taken over all rows (before the split) and
    #    over every column, so test rows contribute to the imputed values
    #    and a missing 'target' would be imputed as well.
    filled = df.fillna(df.median())

    # 2. Separate features and target
    X = filled.drop('target', axis=1)
    y = filled['target']

    # 3. Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Normalize the features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test


# Six-row toy dataset; feature1 has one missing entry so that the imputation
# step has something to fill.
data = pd.DataFrame(
    dict(
        feature1=[1, 2, 3, 4, 5, None],
        feature2=[10, 20, 30, 40, 50, 60],
        target=[0, 1, 0, 1, 0, 1],
    )
)

# preprocess_data returns the *scaled* feature splits first, so X_train and
# X_test hold standardised values from here on.
(X_train, X_test, y_train, y_test) = preprocess_data(data)


In [4]:
# !pip install pandas
# import pandas as pd
# print(dir(pd))

In [23]:
import pandas as pd

# Toy frame used for the step-by-step walkthrough below: two features (one
# with a missing value) and an alternating binary target.
data = pd.DataFrame(
    dict(
        feature1=[1, 2, 3, 4, 5, None],
        feature2=list(range(10, 70, 10)),
        target=[0, 1] * 3,
    )
)

In [24]:
# Show the freshly built frame; feature1 in row 5 is still missing at this point.
data

Unnamed: 0,feature1,feature2,target
0,1.0,10,0
1,2.0,20,1
2,3.0,30,0
3,4.0,40,1
4,5.0,50,0
5,,60,1


In [25]:
# Plain assignment: `df` is a second name for the same DataFrame object as
# `data` (no copy is made), so in-place changes through one are visible
# through the other.
df = data

In [27]:
# Fill missing values with each column's median. Rebinding `df` to the frame
# returned by fillna() (instead of using inplace=True) leaves the object that
# `df` previously referred to unchanged, so frames displayed in earlier cells
# stay accurate and re-running this cell is harmless.
df = df.fillna(df.median())

In [30]:
# Column-wise split: 'target' is the label, every other column is a feature.
y = df['target']
X = df.drop(columns=['target'])

In [31]:
# help(df.drop)

In [32]:
# Inspect the feature matrix (all columns of df except 'target').
X

Unnamed: 0,feature1,feature2
0,1.0,10
1,2.0,20
2,3.0,30
3,4.0,40
4,5.0,50
5,3.0,60


In [39]:
from sklearn.model_selection import train_test_split

# Hold out a test_size=0.2 share of the rows; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# help(train_test_split)

In [43]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so no test statistics are used for scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, silhouette_score

def preprocess_data(df):
    """Impute, split and standardise a DataFrame that has a 'target' column.

    Steps:
      1. Replace missing values with the median of their column.
      2. Separate the 'target' column from the remaining (feature) columns.
      3. Split the rows into train and test sets (test_size=0.2,
         random_state=42).
      4. Standardise the features with a StandardScaler that is fitted on
         the training rows only and then applied to both splits.

    The caller's DataFrame is NOT modified: imputation produces a new frame
    instead of filling in place, so the raw data stays available and calling
    this function again on the same input gives the same result.

    Args:
        df: pandas DataFrame holding the feature columns plus a column
            named 'target'.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``: the
        scaler's output for the train and test features, and the matching
        slices of the 'target' column.

    Raises:
        KeyError: if ``df`` has no 'target' column.
    """
    # 1. Handle missing values by filling with the median.
    #    fillna() without inplace=True returns a new frame, leaving `df` as
    #    the caller passed it.
    #    NOTE: the medians are taken over all rows (before the split) and
    #    over every column, so test rows contribute to the imputed values
    #    and a missing 'target' would be imputed as well.
    filled = df.fillna(df.median())

    # 2. Separate features and target
    X = filled.drop('target', axis=1)
    y = filled['target']

    # 3. Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Normalize the features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

def train_classification_model(X_train, X_test, y_train, y_test):
    """Fit a LogisticRegression on the training split and report test accuracy.

    Args:
        X_train: training features passed to ``fit``.
        X_test: held-out features passed to ``predict``.
        y_train: training labels passed to ``fit``.
        y_test: held-out labels compared against the predictions.

    Returns:
        The fitted LogisticRegression instance. The accuracy is printed,
        not returned.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Score the held-out split and report the share of correct predictions.
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f'Classification Accuracy: {test_accuracy}')

    return model

def train_regression_model(X_train, X_test, y_train, y_test):
    """Fit a LinearRegression on the training split and report test MSE.

    Args:
        X_train: training features passed to ``fit``.
        X_test: held-out features passed to ``predict``.
        y_train: training targets passed to ``fit``.
        y_test: held-out targets compared against the predictions.

    Returns:
        The fitted LinearRegression instance. The mean squared error is
        printed, not returned.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Evaluate on the held-out split with mean squared error.
    predictions = regressor.predict(X_test)
    test_mse = mean_squared_error(y_test, predictions)
    print(f'Regression Mean Squared Error: {test_mse}')

    return regressor

def apply_clustering(X, n_clusters=2, random_state=42):
    """Cluster ``X`` with KMeans and print the silhouette score of the result.

    The number of clusters and the seed used to be hard-coded; they are now
    keyword parameters whose defaults reproduce the previous behaviour, so
    existing ``apply_clustering(X)`` calls are unaffected.

    Args:
        X: feature matrix to cluster. The same data is used for fitting,
            for label prediction and for the silhouette score.
        n_clusters: number of clusters passed to KMeans (default 2).
        random_state: seed passed to KMeans so that runs are repeatable
            (default 42).

    Returns:
        Tuple ``(kmeans, cluster_labels)``: the fitted KMeans instance and
        the labels it predicts for ``X``. The silhouette score is printed,
        not returned.
    """
    # Apply KMeans clustering for unsupervised learning
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(X)

    # Predict cluster labels and evaluate using silhouette score
    cluster_labels = kmeans.predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)

    print(f'Clustering Silhouette Score: {silhouette_avg}')
    return kmeans, cluster_labels


# Toy dataset: two numeric features (feature1 has one missing entry) and a
# binary target.
data = pd.DataFrame(
    {
        'feature1': [1, 2, 3, 4, 5, None],
        'feature2': [10, 20, 30, 40, 50, 60],
        'target': [0, 1, 0, 1, 0, 1],
    }
)

# Step 1: impute, split and standardise.
X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(data)
# Both supervised helpers take the four splits in this same order.
split_args = (X_train_scaled, X_test_scaled, y_train, y_test)

# Step 2: classification (Logistic Regression).
print("\nTraining Classification Model:")
classifier = train_classification_model(*split_args)

# Step 3: regression (Linear Regression) on the same splits.
print("\nTraining Regression Model:")
regressor = train_regression_model(*split_args)

# Step 4: KMeans clustering on the scaled training features only.
print("\nApplying KMeans Clustering:")
kmeans_model, cluster_labels = apply_clustering(X_train_scaled)



Training Classification Model:
Classification Accuracy: 0.5

Training Regression Model:
Regression Mean Squared Error: 0.2777777777777779

Applying KMeans Clustering:
Clustering Silhouette Score: 0.06525381975367067
