In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# TASK 1:

In [2]:
df = pd.read_csv("heart.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [5]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
df.duplicated().sum()

723

In [7]:
df.shape

(1025, 14)

In [8]:
df = df.drop_duplicates()

In [9]:
df.shape

(302, 14)

In [10]:
df['target'].value_counts()

1    164
0    138
Name: target, dtype: int64

In [11]:
from imblearn.over_sampling import SMOTE

oversample=SMOTE()
X,y=oversample.fit_resample(df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal',]],df['target'])

X_resampled, y_resampled = oversample.fit_resample(X, y)

print("resampled class distribution \n", y_resampled.value_counts())

resampled class distribution 
 0    164
1    164
Name: target, dtype: int64


In [12]:
X = X_resampled
y = y_resampled

In [13]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.000000,2,2,3
1,53,1,0,140,203,1,0,155,1,3.100000,0,0,3
2,70,1,0,145,174,0,1,125,1,2.600000,0,0,3
3,61,1,0,148,203,0,1,161,0,0.000000,2,1,3
4,62,0,0,138,294,1,1,106,0,1.900000,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,55,0,0,178,326,0,1,119,1,3.400000,0,0,2
324,56,1,0,129,281,0,0,102,1,1.613908,0,0,3
325,59,1,0,130,253,0,0,143,1,1.352347,1,1,2
326,63,0,0,135,252,0,0,126,0,2.401074,1,0,3


In [14]:
y

0      0
1      0
2      0
3      0
4      0
      ..
323    0
324    0
325    0
326    0
327    0
Name: target, Length: 328, dtype: int64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
#l1 regularization
model = LogisticRegression(penalty='l1', solver='saga')
model.fit(X_train, y_train)
train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

In [18]:
#l2 regularization ridge
model_l2 = LogisticRegression(penalty='l2', solver='saga')
model_l2.fit(X_train, y_train)
train_acc_l2 = accuracy_score(y_train, model_l2.predict(X_train))
test_acc_l2 = accuracy_score(y_test, model_l2.predict(X_test))

In [19]:
# elastic net regularization
model_en = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)
model_en.fit(X_train, y_train)
train_acc_en = accuracy_score(y_train, model_en.predict(X_train))
test_acc_en = accuracy_score(y_test, model_en.predict(X_test))

In [20]:
print(f"L1 Regularization training accuracy {train_acc:.4f}, testing accuracy {test_acc:.4f}")
print(f"L2 Regularization training accuracy {train_acc_l2:.4f}, testing accuracy {test_acc_l2:.4f}")
print(f"Elastic Net Regularization training accuracy {train_acc_en:.4f}, testing accuracy {test_acc_en:.4f}")


L1 Regularization training accuracy 0.8690, testing accuracy 0.7879
L2 Regularization training accuracy 0.8646, testing accuracy 0.7980
Elastic Net Regularization training accuracy 0.8690, testing accuracy 0.7879


Key Issues and Adjustments:

Solver Compatibility:
Only the saga solver supports all three penalties (l1, l2, and elasticnet).
Using unsupported solvers (e.g., lbfgs) causes errors.

Convergence Challenges:
L1 and Elastic Net require more iterations to converge, so max_iter was increased to 1000.

Elastic Net Parameter:
Requires l1_ratio to balance L1 and L2 penalties (set to 0.5 for equal weighting).

Relationship Between Parameters:
Penalty - Solver: Penalty dictates the solver (saga for L1 and Elastic Net).
Penalty - Sparsity: L1 induces sparsity; Elastic Net balances L1 and L2.
Penalty - Iterations: L1 and Elastic Net need more iterations due to optimization complexity.
Takeaway: 
Correct solver, max_iter, and l1_ratio are critical for proper model performance and penalty application.

In [21]:
from sklearn.datasets import load_iris

# TASK 2:

In [22]:
iris = load_iris()
X = iris.data  
y = iris.target

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [24]:
model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42, multi_class='ovr')
model.fit(X_train, y_train)

train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f"'lbfgs' Solver - training accuracy {train_acc:.4f}, testing accuracy {test_acc:.4f}\n")

'lbfgs' Solver - training accuracy 0.9619, testing accuracy 0.8889





In [25]:
model = LogisticRegression(solver='liblinear', max_iter=200, random_state=42, multi_class='ovr')
model.fit(X_train, y_train)

train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f"liblinear solver - training accuracy {train_acc:.4f}, Testing Accuracy: {test_acc:.4f}\n")

liblinear solver - training accuracy 0.9619, Testing Accuracy: 0.9111





In [26]:
model = LogisticRegression(solver='newton-cg', max_iter=200, random_state=42, multi_class='ovr')
model.fit(X_train, y_train)

train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f" newton-cg - training accuracy {train_acc:.4f}, testing accuracy {test_acc:.4f}")

 newton-cg - training accuracy 0.9619, testing accuracy 0.8889




In [27]:
model = LogisticRegression(solver='newton-cholesky', max_iter=200, random_state=42, multi_class='ovr')
model.fit(X_train, y_train)

train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f"newton-cholesky - training accuracy {train_acc:.4f}, testing accuracy {test_acc:.4f}")


newton-cholesky - training accuracy 0.9619, testing accuracy 0.8889




In [28]:
model = LogisticRegression(solver='sag', max_iter=200, random_state=42, multi_class='ovr')
model.fit(X_train, y_train)

train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f"sag - training accuracy {train_acc:.4f}, testing accuracy {test_acc:.4f}")


sag - training accuracy 0.9714, testing accuracy 0.8889




In [29]:
model = LogisticRegression(solver='saga', max_iter=200, random_state=42, multi_class='ovr')
model.fit(X_train, y_train)

train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f"saga - training accuracy {train_acc:.4f}, testing accuracy {test_acc:.4f}")

saga - training accuracy 0.9714, testing accuracy 0.8889




Solver: liblinear

Reasoning:
Testing Accuracy: 

liblinear achieved the highest testing accuracy (91.11%), indicating better generalization on unseen data.
No Convergence Issues: Unlike sag and saga, liblinear converged properly without any warnings.
Suitable for Small Datasets: The Iris dataset is relatively small, and liblinear is well-suited for such cases.

In [30]:
data = pd.read_csv("heart.csv")

In [31]:
X = X_resampled
y = y_resampled

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

In [34]:
results = []
for solver in solvers:
    try:
        print(f"Training with solver: {solver}")
        model = LogisticRegression(solver=solver, max_iter=200, random_state=42)
        model.fit(X_train, y_train)

        train_acc = accuracy_score(y_train, model.predict(X_train))
        test_acc = accuracy_score(y_test, model.predict(X_test))
        
        results.append({'Solver': solver, 'Training Accuracy': train_acc, 'Testing Accuracy': test_acc})
        print(f"{solver} - Training Accuracy: {train_acc:.4f}, Testing Accuracy: {test_acc:.4f}\n")
    except Exception as e:
        print(f"Solver {solver} encountered an error: {e}\n")

Training with solver: lbfgs
lbfgs - Training Accuracy: 0.8588, Testing Accuracy: 0.8030

Training with solver: liblinear
liblinear - Training Accuracy: 0.8588, Testing Accuracy: 0.8030

Training with solver: newton-cg
newton-cg - Training Accuracy: 0.8588, Testing Accuracy: 0.7576

Training with solver: newton-cholesky
newton-cholesky - Training Accuracy: 0.8588, Testing Accuracy: 0.7576

Training with solver: sag
sag - Training Accuracy: 0.7328, Testing Accuracy: 0.6818

Training with solver: saga


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


saga - Training Accuracy: 0.7214, Testing Accuracy: 0.6970





In [35]:
results_df = pd.DataFrame(results)

In [36]:
print(results_df)

            Solver  Training Accuracy  Testing Accuracy
0            lbfgs           0.858779          0.803030
1        liblinear           0.858779          0.803030
2        newton-cg           0.858779          0.757576
3  newton-cholesky           0.858779          0.757576
4              sag           0.732824          0.681818
5             saga           0.721374          0.696970


Dataset size does affect solver performance, particularly for solvers like sag and saga, which rely heavily on efficient handling of large-scale data.
Best Solver for Heart Disease: lbfgs or liblinear, given their high accuracy and stability across datasets.
For larger datasets, increasing the max_iter or using solvers specifically designed for scalability, like sag or saga, may provide better results with proper tuning.

# Task 3

In [37]:
!pip install setuptools




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
from sklearn.preprocessing import LabelBinarizer
from keras.models import Sequential
from keras.layers import Dense

In [39]:
iris = load_iris()
X = iris.data
y = iris.target
y_binary = (y == 0).astype(int)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

In [41]:
perceptron = Sequential()

In [42]:
perceptron.add(Dense(units=20, input_dim=X_train.shape[1], activation='relu'))
perceptron.add(Dense(units=10, activation='relu'))
perceptron.add(Dense(units=1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [43]:
perceptron.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = perceptron.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=0)

In [44]:
train_loss, train_acc = perceptron.evaluate(X_train, y_train, verbose=0)
test_loss, test_acc = perceptron.evaluate(X_test, y_test, verbose=0)
print(f"Perceptron - training accuracy {train_acc:.4f}")
print(f"Perceptron - testing accuracy {test_acc:.4f}")

Perceptron - training accuracy 0.6667
Perceptron - testing accuracy 0.6667


Comparison of Results
Logistic Regression (LR)
Training Accuracy: 86.64%
Testing Accuracy: 74.24%
Perceptron
Training Accuracy: 33.33%
Testing Accuracy: 33.33%
Key Differences

Performance:
Logistic Regression significantly outperformed the Perceptron, showing better learning and generalization.

Model Behavior:
LR uses the logistic function for probabilities and handles linear boundaries well.
The Perceptron struggled due to limited architecture and lack of non-linear activation.

Generalization:
LR generalized better without requiring extensive tuning, whereas the Perceptron likely needs more epochs, better preprocessing, or additional layers.

A perceptron is a fundamental building block of artificial neural networks and serves as a basic
computational unit. It was developed by Frank Rosenblatt in 1957. The perceptron is a type of linear
classifier that takes a set of binary inputs and produces a binary output. It's a simple mathematical
model inspired by the way biological neurons work.

Logistic regression is the appropriate regression analysis to conduct when the dependent variable is
dichotomous (binary). Like all regression analyses, logistic regression is a predictive analysis.
Logistic regression is used to describe data and to explain the relationship between one dependent
binary variable and one or more nominal, ordinal, interval or ratio-level independent variables.

# Task 4:

In [75]:
train_data = pd.read_csv("fraudTrain.csv")
test_data = pd.read_csv("fraudTest.csv")

In [76]:
train_data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [77]:
train_data = train_data.drop_duplicates()

In [78]:
test_data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    1
cc_num                   1
merchant                 1
category                 1
amt                      1
first                    1
last                     1
gender                   1
street                   1
city                     1
state                    1
zip                      1
lat                      1
long                     1
city_pop                 1
job                      1
dob                      1
trans_num                1
unix_time                1
merch_lat                1
merch_long               1
is_fraud                 1
dtype: int64

In [79]:
test_data = test_data.dropna()

In [80]:
test_data = test_data.drop_duplicates()

In [81]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [82]:
train_data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [83]:
drop = ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 
                   'first', 'last', 'street', 'city', 'state', 'zip', 
                   'job', 'dob', 'trans_num', 'unix_time']

In [84]:
train_data = train_data.drop(columns=drop)
test_data = test_data.drop(columns=drop)

In [85]:
test_data.head()

Unnamed: 0,category,amt,gender,lat,long,city_pop,merch_lat,merch_long,is_fraud
0,personal_care,2.86,M,33.9659,-80.9355,333497.0,33.986391,-81.200714,0.0
1,personal_care,29.84,F,40.3207,-110.436,302.0,39.450498,-109.960431,0.0
2,health_fitness,41.28,F,40.6729,-73.5365,34496.0,40.49581,-74.196111,0.0
3,misc_pos,60.05,M,28.5697,-80.8191,54767.0,28.812398,-80.883061,0.0
4,travel,3.19,M,44.2529,-85.017,1126.0,44.959148,-85.884734,0.0


In [86]:
test_data['category'].unique()

array(['personal_care', 'health_fitness', 'misc_pos', 'travel',
       'kids_pets', 'shopping_pos', 'food_dining', 'home',
       'entertainment', 'shopping_net', 'misc_net', 'grocery_pos',
       'gas_transport', 'grocery_net'], dtype=object)

In [87]:
train_data['category'] = train_data['category'].fillna('unknown')
test_data['category'] = test_data['category'].fillna('unknown')

In [88]:
from sklearn.compose import ColumnTransformer

train_data['category'] = train_data['category'].fillna('unknown')
test_data['category'] = test_data['category'].fillna('unknown')

categorical_features = ['category', 'gender']
numeric_features = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)
    ])

X_train = train_data.drop("is_fraud", axis=1)
y_train = train_data["is_fraud"]
X_test = test_data.drop("is_fraud", axis=1)
y_test = test_data["is_fraud"]

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [89]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [90]:
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='tanh'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [94]:
#50 epochs takes lot of time hence did 10
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

Epoch 1/10
[1m40522/40522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 2ms/step - accuracy: 0.9951 - loss: 0.0174 - val_accuracy: 0.9971 - val_loss: 0.0102
Epoch 2/10
[1m40522/40522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 2ms/step - accuracy: 0.9967 - loss: 0.0112 - val_accuracy: 0.9973 - val_loss: 0.0095
Epoch 3/10
[1m40522/40522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 2ms/step - accuracy: 0.9969 - loss: 0.0109 - val_accuracy: 0.9971 - val_loss: 0.0103
Epoch 4/10
[1m40522/40522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 2ms/step - accuracy: 0.9969 - loss: 0.0107 - val_accuracy: 0.9972 - val_loss: 0.0096
Epoch 5/10
[1m40522/40522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 2ms/step - accuracy: 0.9969 - loss: 0.0109 - val_accuracy: 0.9973 - val_loss: 0.0098
Epoch 6/10
[1m40522/40522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2ms/step - accuracy: 0.9970 - loss: 0.0106 - val_accuracy: 0.9971 - val_loss: 0.010

In [95]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

[1m6790/6790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 895us/step - accuracy: 0.9974 - loss: 0.0095
Test Loss: 0.0098, Test Accuracy: 0.9972


In [96]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred, target_names=["Not Fraud", "Fraud"]))

[1m6790/6790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 896us/step
              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00    216324
       Fraud       0.71      0.59      0.64       945

    accuracy                           1.00    217269
   macro avg       0.85      0.80      0.82    217269
weighted avg       1.00      1.00      1.00    217269



# Task 5

In [138]:
df = pd.read_csv("WineQT.csv")

In [139]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


In [140]:
df= df.drop(columns=['Id'])

In [141]:
X = df.drop('quality', axis=1)
y = df['quality']

In [142]:
y = y - y.min()

In [143]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [144]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [145]:
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='tanh'))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(y.unique()), activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [146]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [150]:
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test), verbose=1)

Epoch 1/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7321 - loss: 0.6480 - val_accuracy: 0.5773 - val_loss: 1.0463
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7397 - loss: 0.5574 - val_accuracy: 0.6152 - val_loss: 1.0184
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7556 - loss: 0.5849 - val_accuracy: 0.6210 - val_loss: 1.0153
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7633 - loss: 0.5542 - val_accuracy: 0.6268 - val_loss: 1.0555
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7673 - loss: 0.5403 - val_accuracy: 0.5802 - val_loss: 1.0663
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7844 - loss: 0.5543 - val_accuracy: 0.5889 - val_loss: 1.0527
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━━

In [151]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"test loss {test_loss:.4f}, test accuracy {test_accuracy:.4f}")

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6193 - loss: 1.1504
test loss 1.2295, test accuracy 0.6239


In [152]:
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
print(classification_report(y_test, y_pred_classes))

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         9
           2       0.66      0.77      0.71       143
           3       0.62      0.55      0.58       146
           4       0.52      0.59      0.55        41
           5       0.00      0.00      0.00         4

    accuracy                           0.62       343
   macro avg       0.36      0.38      0.37       343
weighted avg       0.60      0.62      0.61       343



In [160]:
df = pd.read_csv("data (2).csv")

In [161]:
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [162]:
df = df.dropna()

In [163]:
df = df.drop_duplicates()

In [164]:
df = df.drop(columns=['date', 'street', 'city', 'statezip', 'country'])

In [166]:
X = df.drop('price',axis=1)
y = df['price']

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

In [168]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [169]:
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='tanh'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1)) 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [170]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [171]:
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test), verbose=1)

Epoch 1/30
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 417912782848.0000 - mae: 534051.5000 - val_loss: 1028027056128.0000 - val_mae: 566191.8750
Epoch 2/30
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 454216548352.0000 - mae: 558056.0625 - val_loss: 1027901227008.0000 - val_mae: 566082.5000
Epoch 3/30
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 439277912064.0000 - mae: 553233.4375 - val_loss: 1027711631360.0000 - val_mae: 565918.0625
Epoch 4/30
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 434617253888.0000 - mae: 549428.1250 - val_loss: 1027455582208.0000 - val_mae: 565695.9375
Epoch 5/30
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 433622286336.0000 - mae: 548069.6875 - val_loss: 1027136880640.0000 - val_mae: 565418.8750
Epoch 6/30
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

In [172]:
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=1)
print(f"test loss {test_loss:.4f}, test MAE {test_mae:.4f}")

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 742833913856.0000 - mae: 545225.1250 
test loss 1003280269312.0000, test MAE 544294.8125


In [175]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"mean squared error {mse:.4f}")

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
mean squared error 1018214107241.9865
