# Libs

In [2]:
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit


# Data

In [None]:
# Location of the life-satisfaction CSV in the course repository
url_lifesat = (
    'https://github.com/marcinsawinski/UEP_KIE_ML_LAB_PROG'
    '/raw/main/datasets/lifesat/lifesat.csv'
)

# Basic operations

In [None]:
# Fetch the file from the URL and save it locally.
# Uses the dataset URL defined in the Data section; the local file name is
# the last path segment of that URL (here: lifesat.csv).
filename = url_lifesat.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url_lifesat, filename)

## Pandas

In [None]:
# Load a remote or local CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='file or url address')

# Select specific columns
df.loc[:, ['column1', 'column2']]

# Drop the same column from several DataFrames, modifying each one in place
for set_ in (df1, df2):
    set_.drop(columns="column", inplace=True)

# Pick rows that meet a specific criterion, e.g. rows where column1 equals 100
df.loc[df["column1"] == 100]

# Convert the DataFrame to a NumPy array
df.values
# or
df.to_numpy()

# Visualize the DataFrame as a scatter plot
df.plot(kind='scatter', grid=True,
        x="column1", y="column2")

# Scatter plot with extras: marker size taken from col1, marker colour from
# col3 (mapped through the "jet" colormap), with a colorbar and legend
df.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=1,
        s=df["col1"], c="col3", cmap="jet", figsize=(10, 7),
        legend=True, colorbar=True, label="col3")


# Join df_a and df_b on their matching indexes and store the result in df_c
df_c = df_a.merge(df_b, left_index=True, right_index=True)

# Join df_a and df_b on the shared key column column1 and store the result in df_c
df_c = df_a.merge(df_b, on='column1')

# Preview the dataset: first rows, schema, value frequencies, summary
# statistics, histograms and the number of missing values per column
df.head(n=5)
df.info()
df["column1"].value_counts()
df.describe()
df.hist(figsize=(12, 8), bins=50)
df.isnull().sum()

# Correlation table
df[['col1', 'col2']].corr()

# Correlation list: correlations with col1, strongest positive first
corr_matrix = df[['col1', 'col2']].corr()
corr_matrix["col1"].sort_values(ascending=False)

# Correlation plot
df.plot(kind="scatter", x="median_income", y="median_house_value",
        alpha=0.1, grid=True)

# Scatter matrix
scatter_matrix(df[['col1', 'col2']], figsize=(12, 8))

# Median of every numeric column, as a NumPy array.
# numeric_only=True skips non-numeric columns; without it pandas >= 2.0
# raises a TypeError when the frame contains e.g. string columns.
df.median(numeric_only=True).values

# Bucket a numeric column into five labelled categories
df["category_col"] = pd.cut(
    x=df["numeric_col"],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=list(range(1, 6)),
)

# Number of rows per category
df["category_col"].value_counts()

# Share of all rows that falls into each category
df["category_col"].value_counts().div(len(df))

# Bar chart of the category counts, ordered by category
df["category_col"].value_counts().sort_index().plot(kind="bar", rot=0)

## scikit-learn

In [None]:
# Linear regression model
model = LinearRegression()

# k-nearest-neighbours regression model with k=5
model = KNeighborsRegressor(n_neighbors=5)

# Fit the model on independent variables a and dependent variable b
model.fit(X=a, y=b)

# Predict for one new observation (a one-element matrix holding the value 100)
x_n = [[100]]
model.predict(X=x_n)

# Random train/test split
train_set, test_set = train_test_split(df, test_size=0.2)

# Stratified train/test split: the frame being split must be the same one
# that supplies the stratification column
strat_train_set, strat_test_set = train_test_split(
    df, test_size=0.2, stratify=df["category_col"])
# or: generate several stratified splits and keep the first one
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2)
strat_splits = []
for train_index, test_index in splitter.split(df, df["category_col"]):
    # split() yields positional indices, hence iloc
    strat_train_set_n = df.iloc[train_index]
    strat_test_set_n = df.iloc[test_index]
    strat_splits.append([strat_train_set_n, strat_test_set_n])
strat_train_set, strat_test_set = strat_splits[0]


# Impute missing data with the column median
from sklearn.impute import SimpleImputer
# The median strategy only works on numeric data, so fit on the numeric
# columns of df
housing_num = df.select_dtypes(include=[np.number])
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)

# Check the per-column values the imputer learned during fit
imputer.statistics_
