In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2023-11-29 22:51:12.826765: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-29 22:51:12.826800: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-29 22:51:12.827894: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-29 22:51:12.833996: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load data
data = pd.read_csv('Bike-Sharing-Dataset/day.csv', index_col=0)

# Drop 'dteday' if you decide it's redundant
data.drop('dteday', axis=1, inplace=True)

# Define categorical and numerical features
'''
season: 1,4
yr 0,1
mnth 1, 12
holiday 0,1
weekday 0,6
workingday 0,1
weathersit 1,3

'''
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']

Unit variance is a term used in statistics and data preprocessing, particularly in the context of standardizing or scaling data. When a dataset is scaled to have unit variance, it means that the variance of each feature in the dataset is scaled to be 1.

Variance measures how much the values in a dataset are spread out around the mean. When the variance is 1 (unit variance), it indicates a certain standardization of the spread of data points. 

In more technical terms, to achieve unit variance, each feature's values are transformed in such a way that the variance of the feature becomes 1. This is typically done by subtracting the mean of the feature from each data point and then dividing by the standard deviation of the feature. The formula for this standardization (also known as Z-score normalization) for a data point \( x \) is:

\[
z = \frac{x - \mu}{\sigma}
\]

Where:
- \( \mu \) is the mean of the feature.
- \( \sigma \) is the standard deviation of the feature.
- \( x \) is the original value of the data point.
- \( z \) is the standardized value.

This process transforms the data so that the mean of each feature is 0 and the standard deviation (and hence variance) is 1. This type of scaling is beneficial in many machine learning algorithms, especially those that are sensitive to the scale of the input data, like neural networks, by ensuring that all features contribute equally to the model.

In [3]:
# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        #('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Separating target variable
X = data.drop('cnt', axis=1)

# also drop 'casual' and 'registered' if you want to predict 'cnt' without knowing the breakdown
X = X.drop(['casual', 'registered'], axis=1)
y = data['cnt']


In [4]:
# examine the data
#X.head()
#y.head()
X.describe()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486
std,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498
min,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392
25%,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495
50%,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975
75%,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214
max,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463


season: 1,4
yr 0,1
mnth 1, 12
holiday 0,1
weekday 0,6
workingday 0,1
weathersit 1,3




In [5]:
# check data types
print(X.dtypes)
print(y.dtypes)

season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
dtype: object
int64


In [6]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Applying the transformations
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# convert from sparse array to dense array
X_train = X_train.toarray()
X_test = X_test.toarray()


In [8]:
# print dtypes of every array
print(X_train.dtype)
print(X_test.dtype)
print(y_train.dtype)
print(y_test.dtype)


float64
float64
int64
int64


In [9]:
# describe the data using describe()
pd.DataFrame(X_train).describe()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,584.0,584.0,584.0,584.0,584.0,584.0,584.0,584.0,584.0,584.0,...,584.0,584.0,584.0,584.0,584.0,584.0,584.0,584.0,584.0,584.0
mean,0.241438,0.258562,0.258562,0.241438,0.092466,0.080479,0.068493,0.082192,0.094178,0.085616,...,0.140411,0.135274,0.15411,0.130137,0.148973,0.330479,0.669521,0.628425,0.340753,0.030822
std,0.428322,0.43822,0.43822,0.428322,0.289931,0.272267,0.252807,0.274892,0.292327,0.280037,...,0.347711,0.342309,0.361363,0.336743,0.356367,0.470789,0.470789,0.48364,0.474369,0.172983
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# describe the data using describe()
pd.DataFrame(X_test).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,...,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0
mean,0.272109,0.22449,0.251701,0.251701,0.054422,0.068027,0.14966,0.081633,0.047619,0.068027,...,0.14966,0.170068,0.095238,0.190476,0.122449,0.258503,0.741497,0.653061,0.326531,0.020408
std,0.446567,0.418672,0.435474,0.435474,0.227624,0.252653,0.357957,0.27474,0.213687,0.252653,...,0.357957,0.376977,0.294547,0.394019,0.328924,0.439309,0.439309,0.477623,0.470547,0.141875
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,1.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# write out X_train and X_test to csv files
pd.DataFrame(X_train).to_csv('X_train.csv', index=False)
pd.DataFrame(X_test).to_csv('X_test.csv', index=False)

In [12]:
# Create the model
model = Sequential()

# Add a hidden layer with ReLU activation
# The number of units/neurons in the hidden layer is a hyperparameter you can tune
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))

# Add the output layer with linear activation
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')




2023-11-29 22:51:14.211422: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-29 22:51:14.241952: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


The error message you encountered, `"TypeError: 'SparseTensor' object is not subscriptable"`, typically occurs when TensorFlow tries to manipulate a `SparseTensor` object in a way that is not supported. A `SparseTensor` is a TensorFlow data type used to represent sparse data (data with many zeros or missing values). In your context, this issue might be arising from how the data is being passed to the `model.fit()` function.

To address this issue, let's consider a few potential causes and solutions:

1. **Data Format:** Ensure that the data being fed into the model is in a format that TensorFlow can handle. Typically, data should be in the form of NumPy arrays or TensorFlow tensors. If your data is in a sparse format (like from Pandas `get_dummies` or Scikit-Learn's `OneHotEncoder`), you may need to convert it to a dense format using `.toarray()` or `.todense()`. 

2. **Preprocessing Steps:** Double-check the preprocessing steps. If there's a step that might be converting the data to a sparse format inadvertently, modify it to keep the data in a dense format. 

3. **TensorFlow Version:** Sometimes, such errors can be due to version incompatibilities. Ensure that you are using a TensorFlow version that is compatible with your code.

4. **Input Check:** Before passing the data to `model.fit()`, print out the type of `X_train` and `y_train` to verify they are not `SparseTensor` objects. If they are, convert them to a dense format.

5. **Model Architecture:** While not directly related to the error message, it's always good to ensure that the model architecture is appropriate for the task at hand. Since you are dealing with regression (predicting the 'cnt' variable), your current setup with a single linear layer seems appropriate.

6. **GPU Usage:** To ensure TensorFlow uses the GPU, you can set up your environment to use a GPU-enabled TensorFlow version (like `tensorflow-gpu`). You can check GPU usage with `tf.config.list_physical_devices('GPU')`. Also, make sure your system's GPU drivers and CUDA are correctly installed and configured.

As for the specific pip install command for Keras, you can use `pip install tensorflow` as Keras is now integrated into TensorFlow and does not require a separate installation.

If the issue persists even after checking these points, there might be something specific in the data or the way it's being handled that's causing the problem. In such a case, examining the exact format and type of data being used right before `model.fit()` could provide more clues.

In [13]:
# check types of X_train and y_train
print(type(X_train))
print(type(y_train))



<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [14]:
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7efee79da4a0>

In [15]:
# Evaluate the model
loss = model.evaluate(X_test, y_test)
print('Test loss:', loss)

Test loss: 20718110.0


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load data
data = pd.read_csv('Bike-Sharing-Dataset/day.csv', index_col=0)

# Optionally drop 'dteday' if it's redundant
data.drop('dteday', axis=1, inplace=True)

# One-hot encode categorical variables using pd.get_dummies
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

# Separating target variable
X = data.drop('cnt', axis=1)
y = data['cnt']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the model
model = Sequential()
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu')) # Adding a hidden layer
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32)

# Evaluate the model
loss = model.evaluate(X_test_scaled, y_test)
print('Test loss:', loss)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 22159994.0
