# Demonstrating scikit-learn and Modin Interoperability

### Logistic Regression example taken / adapted from https://www.ritchieng.com/pandas-scikit-learn/

In [1]:
import modin.pandas as pd
import numpy as np


In [2]:
# From https://www.ritchieng.com/pandas-scikit-learn/

# Fetch the training CSV over HTTPS rather than plaintext HTTP so the
# download cannot be tampered with in transit.
url = 'https://bit.ly/kaggletrain'
train = pd.read_csv(url)


    import ray
    ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})

2023-01-03 11:03:39,350	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8266


In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Columns used as model features (reused for both the train and test frames):
#   Pclass: passenger class
#   Parch: parents and children
feature_cols = ['Pclass', 'Parch']

In [5]:
# Feature matrix: every row, restricted to the feature_cols columns.
X = train[feature_cols]

In [6]:
# Response vector: the Survived column of the training frame.
y = train['Survived']

In [7]:
# 1. import the estimator
from sklearn.linear_model import LogisticRegression

# 2. instantiate the model with its default settings
logreg = LogisticRegression()

# 3. fit on the feature matrix and response vector built above;
#    the fitted estimator is the cell's displayed value
logreg.fit(X=X, y=y)

LogisticRegression()

In [8]:
# Fetch the test CSV over HTTPS rather than plaintext HTTP so the
# download cannot be tampered with in transit.
url_test = 'https://bit.ly/kaggletest'
test = pd.read_csv(url_test)

In [9]:
# The test frame has no Survived column: that is the value being predicted.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
# Same feature columns as the training matrix, taken from the test frame.
X_new = test[feature_cols]

In [11]:
# 4. predict a class for every row of the test feature matrix
new_pred_class = logreg.predict(X=X_new)

In [12]:
# Kaggle wants exactly two columns: PassengerId and the predicted class.
# Build the frame from both, then promote PassengerId to the index so that
# it is written out as the first column of the CSV.
submission_columns = {'PassengerId': test.PassengerId, 'Survived': new_pred_class}
kaggle_data = pd.DataFrame(submission_columns)
kaggle_data = kaggle_data.set_index('PassengerId')
kaggle_data.to_csv('sub.csv')



In [13]:
# save train data to disk using pickle
# NOTE(review): the recorded output links to Modin's "defaulting to pandas"
# page, so this call presumably falls back to pandas — confirm.
train.to_pickle('train.pkl')

Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


In [14]:
# read the data back from the pickle file written by the previous cell
# (only unpickle files you created yourself; pickle can execute code on load)
pd.read_pickle('train.pkl')



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [15]:
# From https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer

# Two independent L1 normalizers: one over columns 0-1 (given as a list),
# one over columns 2-3 (given as a slice).
first_pair = ("norm1", Normalizer(norm='l1'), [0, 1])
last_pair = ("norm2", Normalizer(norm='l1'), slice(2, 4))
ct = ColumnTransformer([first_pair, last_pair])

raw_values = np.array([[0., 1., 2., 2.],
                       [1., 1., 0., 1.]])
X = pd.DataFrame(raw_values)

# Normalizer scales each row of X to unit norm. A separate scaling
# is applied for the two first and two last elements of each
# row independently.
ct.fit_transform(X)



array([[0. , 1. , 0.5, 0.5],
       [0.5, 0.5, 0. , 1. ]])

In [16]:
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MinMaxScaler

# Mixed-type frame: one free-text column and one numeric column.
X = pd.DataFrame({
    "documents": ["First item", "second one here", "Is this the last?"],
    "width": [3, 4, 5],
})

# The text column is selected by name ("documents"); the numeric column is
# selected with a one-element list (["width"]).
# NOTE(review): newer scikit-learn releases may reject plain strings as
# FeatureHasher samples — confirm against the installed version.
text_step = ("text_preprocess", FeatureHasher(input_type="string"), "documents")
numeric_step = ("num_preprocess", MinMaxScaler(), ["width"])
ct = ColumnTransformer([text_step, numeric_step])
X_trans = ct.fit_transform(X)



In [17]:
# From https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

import numpy as np
from sklearn.impute import SimpleImputer

# Fit a mean-strategy imputer on a frame containing one missing entry.
fit_frame = pd.DataFrame([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(fit_frame)

# Transform a second frame that has NaNs, using the fitted imputer.
X = pd.DataFrame([[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]])
imputed = imp_mean.transform(X)
print(imputed)

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   3.5  9. ]]




In [18]:
# From https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

import numpy as np
from sklearn.model_selection import train_test_split

# X: a 5x2 frame holding the integers 0..9; y: a series holding 0..4.
X, y = pd.DataFrame(np.arange(10).reshape((5, 2))), pd.Series(range(5))

# Only the last expression of a cell is rendered, so show y as a plain list.
list(y)



[0, 1, 2, 3, 4]

In [19]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [20]:
# Check which DataFrame type train_test_split handed back for the Modin input.
type(X_train)

modin.pandas.dataframe.DataFrame

In [22]:
# With shuffle=False the split keeps the original order: the leading rows
# form the first part and the trailing rows the second.
train_test_split(y, shuffle=False)

[0    0
 1    1
 2    2
 dtype: int64,
 3    3
 4    4
 dtype: int64]

### Linear Regression example taken / adapted from https://github.com/chendaniely/2021-07-13-scipy-pandas/blob/main/05-models.ipynb

In [23]:
import seaborn as sns

In [24]:
# Load seaborn's "tips" dataset and wrap it in a Modin DataFrame in one step.
tips = pd.DataFrame(sns.load_dataset("tips"))



In [25]:
# Expand the categorical columns into indicator columns; drop_first=True
# omits the first level of each category.
pd.get_dummies(tips, drop_first=True)

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.50,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0,1,0,1,0,1
240,27.18,2.00,2,1,0,0,1,0,1
241,22.67,2.00,2,0,0,0,1,0,1
242,17.82,1.75,2,0,1,0,1,0,1


In [26]:
from sklearn import linear_model

In [27]:
# 1. create the model object (an unfitted LinearRegression with default settings)
lr = linear_model.LinearRegression()

In [28]:
# 2. fit the model object: tip as a function of total_bill and size
tips_features = tips[["total_bill", "size"]]
tips_target = tips["tip"]
lr.fit(X=tips_features, y=tips_target)

LinearRegression()

In [29]:
# look at the fitted coefficients, one per feature column passed to fit
# (total_bill, then size)
lr.coef_

array([0.09271334, 0.19259779])

In [30]:
# look at the fitted intercept term of the same model
lr.intercept_

0.6689447408125027

In [31]:
# Dummy-encode tips, then keep only the response (tip) and two predictors.
model_columns = ["tip", "total_bill", "smoker_No"]
tips_dummy = pd.get_dummies(tips, drop_first=True)[model_columns]
tips_dummy

Unnamed: 0,tip,total_bill,smoker_No
0,1.01,16.99,1
1,1.66,10.34,1
2,3.50,21.01,1
3,3.31,23.68,1
4,3.61,24.59,1
...,...,...,...
239,5.92,29.03,1
240,2.00,27.18,0
241,2.00,22.67,0
242,1.75,17.82,1


In [32]:
# Second model: the first column of tips_dummy ("tip") is the response and
# every column after it is a predictor.
lr2 = linear_model.LinearRegression()
dummy_features = tips_dummy.iloc[:, 1:]
dummy_target = tips_dummy["tip"]
lr2.fit(X=dummy_features, y=dummy_target)

LinearRegression()

In [33]:
# coefficients and intercept of the second model, displayed together as a tuple
lr2.coef_, lr2.intercept_

(array([0.10572239, 0.14892431]), 0.8142993000217928)

In [34]:
# Reuse the tail of the fitted feature columns as stand-in "new" observations
# (not really new data).
feature_subset = tips_dummy[["total_bill", "smoker_No"]]
new_data = feature_subset.tail()
new_data

Unnamed: 0,total_bill,smoker_No
239,29.03,1
240,27.18,0
241,22.67,0
242,17.82,1
243,18.78,1


In [35]:
# use the model to give predicted tip values
# Select the two feature columns explicitly so this cell can be re-run:
# once predicted_tips has been added to new_data, passing the whole frame
# to predict would hand the model a column it was not fitted on.
feature_names = ["total_bill", "smoker_No"]
new_data["predicted_tips"] = lr2.predict(new_data[feature_names])

In [36]:
# new_data now carries the predicted_tips column alongside the two features
new_data

Unnamed: 0,total_bill,smoker_No,predicted_tips
239,29.03,1,4.032345
240,27.18,0,3.687834
241,22.67,0,3.211026
242,17.82,1,2.847197
243,18.78,1,2.94869


In [38]:
# Check that the frame is still a Modin DataFrame after assigning the predictions.
type(new_data)

modin.pandas.dataframe.DataFrame