In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
data = pd.read_csv("Expanded_data_with_more_features.csv")
data = data.drop("Unnamed: 0", axis=1)
data.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [2]:
print("1. Male count:", len(data.loc[data["Gender"] == "male"]))
print("1. Female count:", len(data.loc[data["Gender"] == "female"]))

1. Male count: 15217
1. Female count: 15424


In [3]:
print("2. Male percentage:", len(data.loc[data["Gender"] == "male"]) / len(data))
print("3. Female percentage:", len(data.loc[data["Gender"] == "female"]) / len(data))

2. Male percentage: 0.496622172905584
3. Female percentage: 0.503377827094416


In [4]:
ethnic_groups = data.groupby("EthnicGroup")["MathScore"].median()
f"4. Math: {ethnic_groups.idxmax()} - {ethnic_groups.max()}"

'4. Math: group E - 76.0'

In [5]:
ethnic_groups = data.groupby("EthnicGroup")["ReadingScore"].median()
f"5. Reading: {ethnic_groups.idxmax()} - {ethnic_groups.max()}"

'5. Reading: group E - 75.0'

In [6]:
ethnic_groups = data.groupby("EthnicGroup")["WritingScore"].median()
f"6. Writing: {ethnic_groups.idxmax()} - {ethnic_groups.max()}"

'6. Writing: group E - 73.0'

In [7]:
test = data.groupby("TestPrep")["MathScore"].median()
f'7. Math: {test["completed"]}'

'7. Math: 70.0'

In [8]:
test = data.groupby("TestPrep")["ReadingScore"].median()
f'8. Reading: {test["completed"]}'

'8. Reading: 74.0'

In [9]:
test = data.groupby("TestPrep")["WritingScore"].median()
f'9. Writing: {test["completed"]}'

'9. Writing: 75.0'

In [10]:
parents = data.groupby("ParentMaritalStatus")["ParentMaritalStatus"].count()
print("10. Parents are married:", parents["married"])
print("11. Single parent:", parents["single"])

10. Parents are married: 16844
11. Single parent: 7097


In [11]:
sport = data.groupby("PracticeSport")["MathScore"].median()
print("12. sport[\"regularly\"] = ", sport["regularly"])
print("12. sport[\"never\"] = ", sport["never"])
print("12. sport[\"regularly\"] > sport[\"never\"]:", sport["regularly"] > sport["never"])

12. sport["regularly"] =  68.0
12. sport["never"] =  64.0
12. sport["regularly"] > sport["never"]: True


In [12]:
sport = data.groupby("PracticeSport")["ReadingScore"].median()
print("13. sport[\"regularly\"] = ", sport["regularly"])
print("13. sport[\"never\"] = ", sport["never"])
print("13. sport[\"regularly\"] > sport[\"never\"]:", sport["regularly"] > sport["never"])

13. sport["regularly"] =  70.0
13. sport["never"] =  68.0
13. sport["regularly"] > sport["never"]: True


In [13]:
transport = data.groupby("TransportMeans")["TransportMeans"].count()
print("14. Number of students using school bus: ", transport["school_bus"])

14. Number of students using school bus:  16145


In [14]:
print("15. Number of students using private transport: ", transport["private"])

15. Number of students using private transport:  11362


In [15]:
from sklearn.preprocessing import OrdinalEncoder
data = data.dropna()
ordinal_encoder = OrdinalEncoder()

In [16]:
X = ordinal_encoder.fit_transform(data.drop("TestPrep", axis=1))

In [17]:
Y = np.array([0 if i == "none" else 1 for i in data["TestPrep"].values])

In [18]:
X_train, X_test = X[:9000], X[9000:]
Y_train, Y_test = Y[:9000], Y[9000:]

In [19]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=13)
sgd_clf.fit(X_train, Y_train == 1)

In [20]:
print("Prediction: ", sgd_clf.predict([X[100]]))
print("Actual: ", Y[100])

Prediction:  [ True]
Actual:  1


In [21]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, Y_train == 1, cv=3, scoring="accuracy")

array([0.657     , 0.60633333, 0.697     ])

In [22]:
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.Dense(64, input_dim=X_train.shape[1], activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)
])

X_valid, X_train_2 = X_train[:5000], X_train[5000:]
Y_valid, Y_train_2 = Y_train[:5000], Y_train[5000:]

model.compile(loss='mean_squared_error', optimizer='adam', metrics=["accuracy"])
model.fit(X_train, Y_train, epochs=100, batch_size=32, verbose=1, validation_data=(X_valid, Y_valid))

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5932 - loss: 16.7371 - val_accuracy: 0.6916 - val_loss: 0.2104
Epoch 2/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 921us/step - accuracy: 0.6859 - loss: 0.2081 - val_accuracy: 0.7168 - val_loss: 0.1880
Epoch 3/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 890us/step - accuracy: 0.7038 - loss: 0.1959 - val_accuracy: 0.7050 - val_loss: 0.1935
Epoch 4/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 895us/step - accuracy: 0.7211 - loss: 0.1901 - val_accuracy: 0.7278 - val_loss: 0.1863
Epoch 5/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 925us/step - accuracy: 0.7204 - loss: 0.1899 - val_accuracy: 0.7198 - val_loss: 0.1876
Epoch 6/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 890us/step - accuracy: 0.7231 - loss: 0.1869 - val_accuracy: 0.6990 - val_loss: 0.2043
Epoch 7/100
[1m282/282

<keras.src.callbacks.history.History at 0x1bca57b0c10>

In [23]:
proba = model.predict(X_test[:4])
proba.round(2)

results = model.evaluate(X_test, Y_test)
print(f"Test loss: {results[0]}, Test accuracy:{results[1]}")

model = keras.models.Sequential([
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])

history = model.fit(X_train_2, Y_train_2, epochs=30, validation_data=(X_valid, Y_valid))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 539us/step - accuracy: 0.7303 - loss: 0.1832
Test loss: 0.18515242636203766, Test accuracy:0.7260568141937256
Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5526 - loss: 5.1334 - val_accuracy: 0.5514 - val_loss: 0.6810
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6381 - loss: 0.6693 - val_accuracy: 0.6538 - val_loss: 0.6571
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6563 - loss: 0.6501 - val_accuracy: 0.6716 - val_loss: 0.6452
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6632 - loss: 0.6340 - val_accuracy: 0.6850 - val_loss: 0.6086
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6544

In [24]:
Y_pred = np.argmax(model.predict(X_test[:42]), axis=-1)
Y_pred

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1],
      dtype=int64)

In [25]:
Y_test[:42]

array([0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1])

In [27]:
Y_test[:42] == Y_pred

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
       False, False, False, False,  True, False,  True, False,  True,
        True, False,  True, False, False,  True,  True,  True, False,
       False,  True, False,  True, False,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True])

In [28]:
results = model.evaluate(X_test, Y_test)
print(f"Test loss: {results[0]}, Test accuracy:{results[1]}")

[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 616us/step - accuracy: 0.7048 - loss: 0.5755
Test loss: 0.5786350965499878, Test accuracy:0.7013570070266724
