In [1]:
import pandas as pd

In [2]:
# reading the data
cookies = pd.read_csv("../Data/cookies.csv")

In [3]:
# drop unnecessary columns or rows with NAs
from cleaning_data import dropping_rows_and_columns


cookies = dropping_rows_and_columns(cookies)

In [4]:
# defining X and y
X = cookies.drop(columns="quality")
y = cookies["quality"]

In [5]:
# splitting the data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

# training datasets to DataFrame again to manipulate them
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [6]:
# imputing the numeric nulls
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from fill_num_nulls import fill_num_na
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

numeric_nulls_imputer = FunctionTransformer(fill_num_na)

In [7]:
from mixin_imputer import mixin_fixer

mixin_encoder = FunctionTransformer(mixin_fixer)

In [8]:
# creating the category encoding pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cat_cols = ["butter type", "mixins"]
categ_encode_pipeline = ColumnTransformer([
    ("oneH_encoder_branch", OneHotEncoder(handle_unknown="ignore"), ["butter type"]),
    ("manual_encoder_branch", mixin_encoder, ["mixins"])
])


In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [10]:
imputer_encoder_pipeline = ColumnTransformer([
    ("num_imputer", numeric_nulls_imputer, num_cols),
    ("cat_full_pipe", categ_encode_pipeline, cat_cols)
])

In [11]:
scaler = FunctionTransformer(StandardScaler)

full_pipeline = Pipeline([
    ("impute_pipeline", imputer_encoder_pipeline),
    ("std_scaler", scaler)
    
])

In [12]:
X_train_preproc = full_pipeline.fit_transform(X_train)

 [3.40e-01 1.00e+00 3.70e+02 ... 1.00e+00 1.00e+00 0.00e+00]
 [3.00e-01 7.40e+00 5.00e+02 ... 1.00e+00 1.00e+00 0.00e+00]
 ...
 [1.90e-01 6.60e+00 4.50e+02 ... 0.00e+00 1.00e+00 0.00e+00]
 [1.70e-01 2.80e+00 4.00e+02 ... 0.00e+00 0.00e+00 0.00e+00]
 [2.40e-01 6.90e+00 6.90e+02 ... 0.00e+00 0.00e+00 0.00e+00]] as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [13]:
X_train_preproc

StandardScaler(copy=array([[3.40e-01, 1.37e+01, 4.40e+02, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [3.40e-01, 1.00e+00, 3.70e+02, ..., 1.00e+00, 1.00e+00, 0.00e+00],
       [3.00e-01, 7.40e+00, 5.00e+02, ..., 1.00e+00, 1.00e+00, 0.00e+00],
       ...,
       [1.90e-01, 6.60e+00, 4.50e+02, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [1.70e-01, 2.80e+00, 4.00e+02, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [2.40e-01, 6.90e+00, 6.90e+02, ..., 0.00e+00, 0.00e+00, 0.00e+00]]))

In [14]:
from sklearn.linear_model import LinearRegression
y_train
reg = LinearRegression()
reg.fit(X_train_preproc, y_train)

ValueError: Expected 2D array, got scalar array instead:
array=StandardScaler(copy=array([[3.40e-01, 1.37e+01, 4.40e+02, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [3.40e-01, 1.00e+00, 3.70e+02, ..., 1.00e+00, 1.00e+00, 0.00e+00],
       [3.00e-01, 7.40e+00, 5.00e+02, ..., 1.00e+00, 1.00e+00, 0.00e+00],
       ...,
       [1.90e-01, 6.60e+00, 4.50e+02, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [1.70e-01, 2.80e+00, 4.00e+02, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [2.40e-01, 6.90e+00, 6.90e+02, ..., 0.00e+00, 0.00e+00, 0.00e+00]])).
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [15]:
type(y_train)

pandas.core.series.Series