In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pkg_resources
from scipy.stats import linregress
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import types

### Package requirements for reproducibility

In [None]:
def get_imports():
    """Yield the distribution name behind every imported object in this namespace.

    Walks the module-level globals and, for each imported module, class or
    function, yields the top-level package it comes from. Import names that
    differ from their distribution name are translated (``PIL`` -> ``Pillow``,
    ``sklearn`` -> ``scikit-learn``).

    Globals that are not imports (plain variables, objects defined in this
    namespace itself) are skipped, so an unrelated variable name can never be
    mistaken for a package name.

    Yields:
        str: a package / distribution name. Names may repeat; the caller is
        expected to de-duplicate.
    """
    # Import name -> distribution name, for packages where the two differ.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }
    namespace = globals()
    own_module = namespace.get("__name__")

    # Iterate over a snapshot: this is a generator, so the namespace may be
    # modified by the consumer between two yields.
    for val in list(namespace.values()):
        if isinstance(val, types.ModuleType):
            # `import numpy as np` -> "numpy"
            qualified = val.__name__
        elif isinstance(val, (type, types.FunctionType, types.BuiltinFunctionType)):
            # `from scipy.stats import linregress` -> "scipy.stats...."
            qualified = getattr(val, "__module__", None)
        else:
            # Ordinary variable: not an import.
            continue

        # Some builtins report no module; objects defined here are not imports.
        if not isinstance(qualified, str) or qualified == own_module:
            continue

        name = qualified.split(".")[0]
        yield poorly_named_packages.get(name, name)
# De-duplicated package names referenced by this notebook's namespace.
imports = list(set(get_imports()))

# (project name, version) for every installed distribution that the notebook
# imports; pip itself is always left out.
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name != "pip" and dist.project_name in imports
]

# Print the pins in requirements.txt syntax.
for requirement in requirements:
    print("{}=={}".format(*requirement))

#### Define the initial dataset you'll be working on

In [None]:
# Open the workbook once and read every sheet from the same handle, instead of
# re-opening and re-parsing 'Data.xlsx' for each of the four sheets.
with pd.ExcelFile('Data.xlsx') as workbook:
    SatelliteJuly = pd.read_excel(workbook, sheet_name='July_sat')
    Seed = pd.read_excel(workbook, sheet_name='Chem_comp_wheat')
    # Only the 'W' and 'P/L' columns of the dough sheet are kept.
    Dough = pd.read_excel(workbook, sheet_name='Dough', usecols=['W', 'P/L'])
    # Bread sheet: keep the columns at zero-based positions 2 through 10.
    Bread = pd.read_excel(workbook, sheet_name='Bread', usecols=[2, 3, 4, 5, 6, 7, 8, 9, 10])

#### The figures are only normalised, for the sake of convenience

In [None]:
# Fit a StandardScaler on the satellite table and apply it to that same table.
# The result is wrapped in a fresh DataFrame, so it carries default integer
# column labels rather than the original column names.
scaler = StandardScaler()
scaledSatellite = pd.DataFrame(
    scaler.fit(SatelliteJuly).transform(SatelliteJuly)
)

In [None]:
#### The PCA is finally run

In [None]:
# A float n_components between 0 and 1 is a variance target: keep as many
# principal components as needed to explain 99% of the variance.
pca = PCA(n_components=.99)
pca.fit(scaledSatellite)
# Component scores, one column per retained component (integer column labels).
PCA_Satellite = pd.DataFrame(pca.transform(scaledSatellite))
# A notebook cell only displays its final expression, so show both diagnostics
# together: (number of components kept, total explained variance ratio).
pca.n_components_, pca.explained_variance_ratio_.sum()

#### Let us now draw scatter plots of the PCA components against the wheat composition

In [None]:
# One figure per wheat-composition column: every principal component is
# scattered against it, with its least-squares line and R^2 in the legend.
colors = ['b','g','r']
for c1 in Seed:
    target = Seed[c1]
    fig, ax = plt.subplots()
    for c0, c in enumerate(PCA_Satellite):
        scores = PCA_Satellite[c]
        # PCA(.99) decides the number of components from the data, so cycle
        # through the palette instead of indexing past its end.
        color = colors[c0 % len(colors)]
        # Fit once; the same result provides both the line and R^2.
        fit = linregress(scores, target)
        xs = np.unique(scores)
        # Name the component in the label so legend entries are distinguishable.
        ax.scatter(scores, target, color=color, s=1,
                   label='PC'+str(c0+1)+'_'+str(c1))
        ax.plot(xs, fit.intercept + fit.slope*xs, color=color,
                label='R2='+str(fit.rvalue**2))
    ax.set_xlabel('Principal component score')
    ax.set_ylabel(str(c1))
    ax.legend()
    plt.show()

In [None]:
# One figure per dough column: every principal component is scattered against
# it, with its least-squares line and R^2 in the legend.
colors = ['b','g','r']
for c1 in Dough:
    target = Dough[c1]
    fig, ax = plt.subplots()
    for c0, c in enumerate(PCA_Satellite):
        scores = PCA_Satellite[c]
        # PCA(.99) decides the number of components from the data, so cycle
        # through the palette instead of indexing past its end.
        color = colors[c0 % len(colors)]
        # Fit once; the same result provides both the line and R^2.
        fit = linregress(scores, target)
        xs = np.unique(scores)
        # Name the component in the label so legend entries are distinguishable.
        ax.scatter(scores, target, color=color, s=1,
                   label='PC'+str(c0+1)+'_'+str(c1))
        ax.plot(xs, fit.intercept + fit.slope*xs, color=color,
                label='R2='+str(fit.rvalue**2))
    ax.set_xlabel('Principal component score')
    ax.set_ylabel(str(c1))
    ax.legend()
    plt.show()

In [None]:
# One figure per bread column: every principal component is scattered against
# it, with its least-squares line and R^2 in the legend.
colors = ['b','g','r']
for c1 in Bread:
    target = Bread[c1]
    fig, ax = plt.subplots()
    for c0, c in enumerate(PCA_Satellite):
        scores = PCA_Satellite[c]
        # PCA(.99) decides the number of components from the data, so cycle
        # through the palette instead of indexing past its end.
        color = colors[c0 % len(colors)]
        # Fit once; the same result provides both the line and R^2.
        fit = linregress(scores, target)
        xs = np.unique(scores)
        # Name the component in the label so legend entries are distinguishable.
        ax.scatter(scores, target, color=color, s=1,
                   label='PC'+str(c0+1)+'_'+str(c1))
        ax.plot(xs, fit.intercept + fit.slope*xs, color=color,
                label='R2='+str(fit.rvalue**2))
    ax.set_xlabel('Principal component score')
    ax.set_ylabel(str(c1))
    ax.legend()
    plt.show()

In [None]:
# One figure per wheat-composition column: each raw satellite column is
# scattered against it, with a degree-1 polynomial fit and its R^2.
for target_name in Seed:
    target = Seed[target_name]
    for band_name in SatelliteJuly:
        band = SatelliteJuly[band_name]
        # The legend label drops the last 7 characters of the satellite column
        # name (presumably a fixed-length suffix -- confirm against the sheet).
        plt.scatter(band, target, s=1,
                    label=str(band_name)[:-7]+'_'+str(target_name))
        band_values = np.unique(band)
        trend = np.poly1d(np.polyfit(band, target, 1))
        r_squared = linregress(band, target)[2]**2
        plt.plot(band_values, trend(band_values), label='R2='+str(r_squared))
    plt.legend()
    plt.show()

In [None]:
# One figure per dough column: each raw satellite column is scattered against
# it, with a degree-1 polynomial fit and its R^2.
for target_name in Dough:
    target = Dough[target_name]
    for band_name in SatelliteJuly:
        band = SatelliteJuly[band_name]
        # The legend label drops the last 7 characters of the satellite column
        # name (presumably a fixed-length suffix -- confirm against the sheet).
        plt.scatter(band, target, s=1,
                    label=str(band_name)[:-7]+'_'+str(target_name))
        band_values = np.unique(band)
        trend = np.poly1d(np.polyfit(band, target, 1))
        r_squared = linregress(band, target)[2]**2
        plt.plot(band_values, trend(band_values), label='R2='+str(r_squared))
    plt.legend()
    plt.show()

In [None]:
# One figure per bread column: each raw satellite column is scattered against
# it, with a degree-1 polynomial fit and its R^2.
for target_name in Bread:
    target = Bread[target_name]
    for band_name in SatelliteJuly:
        band = SatelliteJuly[band_name]
        # The legend label drops the last 7 characters of the satellite column
        # name (presumably a fixed-length suffix -- confirm against the sheet).
        plt.scatter(band, target, s=1,
                    label=str(band_name)[:-7]+'_'+str(target_name))
        band_values = np.unique(band)
        trend = np.poly1d(np.polyfit(band, target, 1))
        r_squared = linregress(band, target)[2]**2
        plt.plot(band_values, trend(band_values), label='R2='+str(r_squared))
    plt.legend()
    plt.show()

In [None]:
# Each retained component's explained-variance ratio, rescaled so that the
# displayed values sum to 1 over the components that were kept.
retained_ratio = pca.explained_variance_ratio_
retained_ratio / retained_ratio.sum()