In [1]:
import numpy as np
import pandas as pd
 
# Seed NumPy's global RNG so every run draws the same synthetic sample.
np.random.seed(42)

# Two synthetic features, 50 uniform draws each, rounded to two decimals.
# Draw order matters: feature1 is sampled before feature2 to reproduce the same data.
feature1 = np.random.uniform(low=50, high=100, size=50).round(2)  # Feature 1: e.g., hours studied
feature2 = np.random.uniform(low=20, high=80, size=50).round(2)   # Feature 2: e.g., attendance percentage

# Practice label: 'Yes' only when feature1 > 75 AND feature2 > 50, otherwise 'No'.
target = np.where(np.logical_and(feature1 > 75, feature2 > 50), 'Yes', 'No')

# Assemble the three arrays into a single DataFrame.
df = pd.DataFrame(
    dict(
        hours_studied=feature1,
        attendance_percentage=feature2,
        placement=target,
    )
)

In [2]:
# Preview the first rows instead of dumping the whole 50-row frame.
df.head(10)

Unnamed: 0,hours_studied,attendance_percentage,placement
0,68.73,78.18,No
1,97.54,66.51,Yes
2,86.6,76.37,Yes
3,79.93,73.69,Yes
4,57.8,55.87,No
5,57.8,75.31,No
6,52.9,25.31,No
7,93.31,31.76,No
8,80.06,22.71,No
9,85.4,39.52,No


In [15]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# .map() plus an explicit cast is used instead of .replace(), whose implicit
# object -> int downcasting is deprecated (FutureWarning in pandas 2.2).
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent when it is
# re-run on labels that are already encoded. Any other label maps to NaN and
# makes the int cast raise, rather than passing through silently.
df['placement'] = df['placement'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0}).astype('int64')

In [17]:
# Preview the first rows after encoding instead of dumping the whole frame.
df.head(10)

Unnamed: 0,hours_studied,attendance_percentage,placement
0,68.73,78.18,0
1,97.54,66.51,1
2,86.6,76.37,1
3,79.93,73.69,1
4,57.8,55.87,0
5,57.8,75.31,0
6,52.9,25.31,0
7,93.31,31.76,0
8,80.06,22.71,0
9,85.4,39.52,0


In [19]:
# Separate the label vector (y) from the predictor matrix (X).
y = df.loc[:, 'placement']
X = df.loc[:, ['hours_studied', 'attendance_percentage']]

In [21]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column with StandardScaler.
# NOTE(review): the scaler is fit on ALL rows here, before the train/test split
# performed later in the notebook, so test-set statistics influence the scaling.
scaler = StandardScaler()

# fit_transform returns a plain array; rebuild the DataFrame from the input's own
# column labels and row index rather than re-typing the column names. Keeping the
# index means X stays aligned with y (which keeps df's index) even if df's index
# is not the default 0..n-1 range.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Display the first rows of the scaled DataFrame
X.head()

Unnamed: 0,hours_studied,attendance_percentage
0,-0.249391,1.564504
1,1.765345,0.924195
2,1.000291,1.465193
3,0.533846,1.318147
4,-1.013746,0.340401


In [23]:
# Outlier check: summary statistics of the scaled features, rounded to two decimals.
# Compare min/max against the quartiles to spot values far from the bulk of the data.
X.describe().round(decimals=2)

Unnamed: 0,hours_studied,attendance_percentage
count,50.0,50.0
mean,0.0,-0.0
std,1.01,1.01
min,-1.49,-1.61
25%,-0.92,-0.91
50%,-0.03,0.05
75%,0.71,0.91
max,1.83,1.62


In [25]:
# Missing-value check: count of null entries in each feature column.
X.isna().sum()

hours_studied            0
attendance_percentage    0
dtype: int64

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with library defaults and train it on the
# training split. fit() returns the estimator itself, so both steps can be chained.
lg = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for the held-out test rows.
y_pred = lg.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [39]:
# Display the test-set accuracy computed by accuracy_score.
accuracy

0.9

In [43]:
# Class-membership probabilities for every test row (one column per class).
y_pred_proba = lg.predict_proba(X_test)

# Keep only the second column (index 1); with the 0/1 labels used here that is
# presumably the probability of class 1.
probability_values = y_pred_proba[:, 1]

In [45]:
# Show the extracted probabilities rounded to two decimals.
np.round(probability_values, 2)

array([0.01, 0.05, 0.53, 0.16, 0.34, 0.01, 0.07, 0.58, 0.  , 0.3 ])

In [47]:
# Display the predicted class labels for the test rows.
y_pred

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [49]:
# Display the true labels of the test rows (indexed by their original row labels).
y_test

13    0
39    0
30    1
45    0
17    1
48    0
26    0
25    1
32    0
19    0
Name: placement, dtype: int64

In [51]:
# Side-by-side comparison table: test features, true label, second-column
# probability from predict_proba, predicted label, and whether the prediction
# matched the true label.
results_df = X_test.assign(
    y_test=y_test.values,
    y_pred_prob=y_pred_proba[:, 1],
    y_pred=y_pred,
    Match=lambda frame: frame['y_test'] == frame['y_pred'],
)
results_df

Unnamed: 0,hours_studied,attendance_percentage,y_test,y_pred_prob,y_pred,Match
13,-0.816538,-0.45299,0,0.008061,0,True
39,-0.020014,-0.073304,0,0.049933,0,True
30,0.565315,1.213898,1,0.534819,1,True
45,0.757628,-0.001975,0,0.163538,0,True
17,0.275797,1.013081,1,0.343942,0,False
48,0.352722,-1.543764,0,0.007588,0,True
26,-0.861294,0.911576,0,0.071626,0,True
25,1.18631,0.772211,1,0.584448,1,True
32,-1.331936,-0.538584,0,0.003178,0,True
19,-0.541007,1.621018,0,0.296288,0,True


In [53]:
# Tally how many test predictions agree (True) or disagree (False) with the true label.
match_count = results_df.loc[:, 'Match'].value_counts()
match_count

Match
True     9
False    1
Name: count, dtype: int64

In [55]:
# Recompute the test accuracy so the value is in scope for the cells that follow.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [57]:
# Display the recomputed test-set accuracy.
accuracy

0.9

In [67]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and per-class precision / recall / F1 for the test predictions.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

# Print the evaluation summary: overall accuracy, then the matrix, then the report.
print("\nModel Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_rep)


Model Accuracy: 0.9

Confusion Matrix:
 [[7 0]
 [1 2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       1.00      0.67      0.80         3

    accuracy                           0.90        10
   macro avg       0.94      0.83      0.87        10
weighted avg       0.91      0.90      0.89        10

