In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

# Seed both global random number generators (Python's and NumPy's) with the
# same value so a fresh Restart & Run All reproduces the same data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Build a small synthetic regression problem in which every feature is informative.
n_features = 4
feature_names = [f"feature{idx}" for idx in range(1, n_features + 1)]

X, y = make_regression(
    n_samples=100,
    n_features=n_features,
    n_informative=n_features,
    n_targets=1,
)

# Collect the feature matrix and the target vector in a single frame.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

df.head()

Unnamed: 0,feature1,feature2,feature3,feature4,target
0,-1.448084,-1.407464,0.23205,-0.471038,-143.065368
1,-1.918771,-0.026514,-0.074446,0.25755,-4.461594
2,0.005113,-0.234587,0.261055,0.29612,-0.835365
3,-0.485364,0.081874,-0.236819,-0.772825,-34.189506
4,0.02451,0.497998,-0.77301,0.097676,26.118365


In [3]:
def rescale(factor, offset, ndigits):
    """Return a function computing round(value * factor + offset, ndigits)."""
    def transform(value):
        return round(value * factor + offset, ndigits)
    return transform


def to_color(value):
    """Bucket a number into 'red' (> 0.5), 'blue' (< -0.5) or 'green' (otherwise)."""
    if value > 0.5:
        return 'red'
    if value < -0.5:
        return 'blue'
    return 'green'


# Intended scales (per the original notes): feature1 ~ mean 500 / std 100,
# feature3 ~ mean -20 / std 50, feature4 ~ mean 5 / std 10 (rounded to whole numbers).
df['feature1'] = df['feature1'].apply(rescale(100, 500, 2))
df['feature2'] = df['feature2'].apply(to_color)  # numeric column becomes categorical
df['feature3'] = df['feature3'].apply(rescale(-50, -20, 3))
df['feature4'] = df['feature4'].apply(rescale(10, 5, 0))

df.head()

Unnamed: 0,feature1,feature2,feature3,feature4,target
0,355.19,blue,-31.602,0.0,-143.065368
1,308.12,green,-16.278,8.0,-4.461594
2,500.51,green,-33.053,8.0,-0.835365
3,451.46,green,-8.159,-3.0,-34.189506
4,502.45,green,18.65,6.0,26.118365


Introduce a deliberate inconsistency in `feature2`: randomly replace some occurrences of `red` with the capitalized variant `Red`.

In [4]:
# Display the column labels of the frame (four features plus the target).
df.columns

Index(['feature1', 'feature2', 'feature3', 'feature4', 'target'], dtype='object')

In [5]:
# Preview the target column as a Series.
df['target']

0    -143.065368
1      -4.461594
2      -0.835365
3     -34.189506
4      26.118365
         ...    
95    -47.525972
96   -160.666558
97    -16.351977
98     82.955898
99    196.616246
Name: target, Length: 100, dtype: float64

In [6]:
def maybe_capitalize(color):
    """Turn 'red' into 'Red' about half of the time; return any other value unchanged."""
    # The random draw happens only for 'red' entries (short-circuit `and`).
    if color == 'red' and random.randint(0, 1) == 0:
        return 'Red'
    return color


# Misspell a random subset of the 'red' labels as 'Red'.
df['feature2'] = df['feature2'].apply(maybe_capitalize)

# For every column (the target included), draw a fresh random 10% sample of
# rows and overwrite that column's values in those rows with NaN.
for column in df.columns:
    rows_to_blank = df.sample(frac=0.1).index
    df.loc[rows_to_blank, column] = np.nan

In [7]:
# Summary statistics. As the output shows, only the numeric columns are
# reported (feature2 holds strings), and each count is 90 rather than 100
# because of the NaN values injected in the previous cell.
df.describe()

Unnamed: 0,feature1,feature3,feature4,target
count,90.0,90.0,90.0,90.0
mean,501.527,-22.472467,5.122222,1.764505
std,106.89865,48.426015,8.902503,96.056905
min,175.87,-212.637,-15.0,-175.916416
25%,435.3225,-47.81125,-2.0,-69.474014
50%,506.59,-18.768,5.0,4.97899
75%,578.125,8.636,10.75,70.712312
max,718.98,77.984,28.0,274.007092


In [8]:
# Persist the synthetic dataset as CSV, without the row index.
output_path = Path('./data/data-regression.csv')
# to_csv does not create missing directories and raises OSError if ./data is
# absent, so create the parent directory first (no-op when it already exists).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)