# Factor Analysis: 
The dataset you are going to use in this practice is the famous Iris data set. The dataset consists of 150 records of Iris plant with four features: “sepal-length”, “sepal-width”, “petal-length”, and “petal-width”. All the features are numeric. The records have been classified into one of the three classes, that is, “Iris-setosa”, “Iris-versicolor”, or “Iris-verginica”.

## Objective

- Train the models on original number of features
- Reduce the number of variables by merging correlated variables
- Extract the most important features from the dataset that are responsible for maximum variance in the output.

In [1]:
# Import required libraries
import pandas as pd
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

In [3]:
iris.DESCR



In [4]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [7]:
iris.data.shape

(150, 4)

In [8]:
# import train_test_split
from sklearn.model_selection import train_test_split

In [9]:
X = iris.data
X.shape

(150, 4)

In [10]:
Y = iris.target
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2,random_state = 1)

In [12]:
x_train.shape

(120, 4)

In [13]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [14]:
logreg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
y_pred = logreg.predict(x_test)

In [16]:
from sklearn import metrics
acc = metrics.accuracy_score(y_pred, y_test)
acc

0.9666666666666667

In [17]:
# PCA transformation
from sklearn.decomposition import PCA

In [18]:
pca = PCA(n_components=0.95)

In [19]:
pca.fit(x_train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [20]:
x_train_trans = pca.transform(x_train)

In [21]:
x_train_trans.shape

(120, 2)

In [22]:
x_test_trans = pca.transform(x_test)

In [23]:
x_test_trans.shape

(30, 2)

In [24]:
x_test_trans

array([[-2.68738967,  1.24153895],
       [-0.96538559, -0.70916231],
       [ 0.84875981,  0.36063236],
       [-2.667761  ,  0.87375109],
       [ 3.18881499,  1.39096721],
       [ 1.04563026,  0.31741614],
       [ 1.87475727,  0.42737639],
       [-2.25684139,  0.50141011],
       [-2.68626909, -0.12972275],
       [ 2.37996557,  0.39664779],
       [ 0.29972026, -0.46441361],
       [-2.32593534,  0.80416676],
       [ 2.56639731,  0.3617593 ],
       [ 0.88247689,  0.35216014],
       [ 0.7592687 , -0.12787235],
       [-3.05198739, -0.2718511 ],
       [ 0.17301658, -0.3648974 ],
       [ 0.60593977, -0.31331324],
       [-2.45919725,  0.47071942],
       [-2.91874569,  0.13417817],
       [ 0.40549318, -0.62879833],
       [ 0.5330404 , -0.44285763],
       [ 1.50660921,  0.29454543],
       [-2.81965942,  0.32889825],
       [ 2.56458182,  0.58291385],
       [ 0.24318335, -0.30859743],
       [-2.64088627,  1.159726  ],
       [-2.63488745,  0.57982198],
       [ 0.86550017,

In [25]:
# Fit the Logistic Ligression over transformed dataset
logreg = LogisticRegression()

In [26]:
logreg.fit(x_train_trans, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
y_pred = logreg.predict(x_test_trans)

In [28]:
acc = metrics.accuracy_score(y_pred, y_test)
acc

0.9666666666666667