# Train Test Splitting

Train-test splitting involves dividing a dataset into two parts: one for training a machine learning model (the "train" set) and the other for evaluating its performance (the "test" set).

## 1. Import Required Libraries

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split


## 2. Load Dataset

The iris dataset contains measurements of 150 iris flowers from three different species.

In [2]:
# Load the iris dataset bundled with scikit-learn, then pull out the two
# arrays used throughout the rest of the notebook.
iris = datasets.load_iris()
X, y = iris.data, iris.target  # X: feature measurements, y: species labels


In [3]:
# Input data: the first 10 rows of the feature matrix (one row per flower)
iris.data[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [4]:
# Output data: the target array, one integer species label per flower
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## 3. Train-Test Split

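We'll split the dataset into a training set and a test set using the `train_test_split` function. The test set will comprise 20% of the total data (`test_size=0.2`), and fixing `random_state=42` makes the shuffle, and therefore the split, reproducible.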


In [5]:
# Hold out 20% of the samples for testing; the fixed seed makes the shuffle
# (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 4. Verify the Split

To ensure our data was split correctly, let's print the sizes of our training and test sets:

In [6]:
# Sanity checks: the two sets together must account for every sample, and
# features and labels must have matching lengths within each set.
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated samples"
assert len(X_train) == len(y_train), "training features/labels are misaligned"
assert len(X_test) == len(y_test), "test features/labels are misaligned"

# Report the resulting sizes.
print(f"Size of Training Set: {len(X_train)}")
print(f"Size of Test Set: {len(X_test)}")


Size of Training Set: 120
Size of Test Set: 30


# Training and Evaluating Models

Now you can use `X_train` and `y_train` to train a machine learning model, and then evaluate its performance using `X_test` and `y_test`.

## 1. Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


# Model selection + training: fit() returns the estimator itself, so
# construction and fitting are chained into a single statement.
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_logreg = logreg.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

# Report: one newline-separated print emits the accuracy line, the matrix
# caption, and the matrix itself.
print(f"Accuracy: {accuracy_logreg:.4f}", "Confusion Matrix:", cm_logreg, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## 2. Decision Tree

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Model selection + training: a seeded decision tree, fitted in one chained
# statement (fit() returns the estimator itself).
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_dt = dt.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_dt = confusion_matrix(y_test, y_pred_dt)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Report: accuracy line, matrix caption, then the matrix, newline-separated.
print(f"Accuracy: {accuracy_dt:.4f}", "Confusion Matrix:", cm_dt, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## 3. Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Model selection + training: a seeded random forest, fitted in one chained
# statement (fit() returns the estimator itself).
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_rf = rf.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_rf = confusion_matrix(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Report: accuracy line, matrix caption, then the matrix, newline-separated.
print(f"Accuracy: {accuracy_rf:.4f}", "Confusion Matrix:", cm_rf, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## 4. K-Nearest Neighbors

In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Model selection + training: k-nearest neighbours with the library's default
# settings, fitted in one chained statement (fit() returns the estimator).
knn = KNeighborsClassifier().fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_knn = knn.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_knn = confusion_matrix(y_test, y_pred_knn)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Report: accuracy line, matrix caption, then the matrix, newline-separated.
print(f"Accuracy: {accuracy_knn:.4f}", "Confusion Matrix:", cm_knn, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## 5. Naive Bayes

In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Model selection + training: Gaussian naive Bayes, fitted in one chained
# statement (fit() returns the estimator itself).
nb = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_nb = nb.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_nb = confusion_matrix(y_test, y_pred_nb)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

# Report: accuracy line, matrix caption, then the matrix, newline-separated.
print(f"Accuracy: {accuracy_nb:.4f}", "Confusion Matrix:", cm_nb, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## 6. Support Vector Machine (SVM)

In [52]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Model selection + training: a seeded support vector classifier, fitted in
# one chained statement (fit() returns the estimator itself).
svm = SVC(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_svm = svm.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_svm = confusion_matrix(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Report: accuracy line, matrix caption, then the matrix, newline-separated.
print(f"Accuracy: {accuracy_svm:.4f}", "Confusion Matrix:", cm_svm, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## 7. Neural Network

In [53]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Model selection + training: a seeded multi-layer perceptron allowed up to
# 1000 iterations, fitted in one chained statement (fit() returns the
# estimator itself).
mlp = MLPClassifier(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_mlp = mlp.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_mlp = confusion_matrix(y_test, y_pred_mlp)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)

# Report: accuracy line, matrix caption, then the matrix, newline-separated.
print(f"Accuracy: {accuracy_mlp:.4f}", "Confusion Matrix:", cm_mlp, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## 8. Gradient Boosting

In [54]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Model selection + training: a seeded gradient-boosting classifier, fitted
# in one chained statement (fit() returns the estimator itself).
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred_gb = gb.predict(X_test)

# Evaluation: confusion matrix and overall accuracy on the test set.
cm_gb = confusion_matrix(y_test, y_pred_gb)
accuracy_gb = accuracy_score(y_test, y_pred_gb)

# Report: accuracy line, matrix caption, then the matrix, newline-separated.
print(f"Accuracy: {accuracy_gb:.4f}", "Confusion Matrix:", cm_gb, sep="\n")


Accuracy: 1.0000
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
