In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Load your camera dataset
data = pd.read_csv("camera_dataset.csv")
print(data.head())

# Let's first explore what we might want to predict
print("\nDataset info:")
print(data.info())
print("\nColumn names:")
print(data.columns.tolist())

# For demonstration, let's create a binary classification target
# We'll predict if a camera is "Expensive" (price > $500) or not
data['is_expensive'] = (data['Price'] > 500).astype(int)

# Handle missing values - fill with mean for numerical columns, mode for categorical
# First, let's check which columns have missing values
print("\nMissing values:")
print(data.isnull().sum())

# Fill missing values
for column in data.columns:
    if data[column].dtype == 'object':
        # For categorical columns, fill with mode
        data[column].fillna(data[column].mode()[0], inplace=True)
    else:
        # For numerical columns, fill with mean
        data[column].fillna(data[column].mean(), inplace=True)

# Encode categorical variables if needed
# For this dataset, the main categorical column is 'Model' which has many unique values
# We might want to extract brand information instead
data['Brand'] = data['Model'].apply(lambda x: x.split()[0] if pd.notnull(x) else 'Unknown')

# Encode the brand column
le = LabelEncoder()
data['Brand_encoded'] = le.fit_transform(data['Brand'])

# Select features for the model
# Let's use numerical features that might predict price category
feature_columns = [
    'Max resolution', 'Low resolution', 'Effective pixels',
    'Zoom wide (W)', 'Zoom tele (T)', 'Normal focus range',
    'Macro focus range', 'Storage included', 'Weight (inc. batteries)',
    'Dimensions', 'Brand_encoded'
]

X = data[feature_columns]
y = data['is_expensive']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Make predictions
y_pred = lr.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Show some sample predictions
print("\nSample predictions:")
sample_results = pd.DataFrame({
    'Actual': y_test[:10].values,
    'Predicted': y_pred[:10],
    'Correct': y_test[:10].values == y_pred[:10]
})
print(sample_results)

# Show feature importance
feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Coefficient': lr.coef_[0]
}).sort_values('Coefficient', key=abs, ascending=False)

print("\nFeature importance (absolute coefficients):")
print(feature_importance)

                    Model  Release date  Max resolution  Low resolution  \
0        Agfa ePhoto 1280          1997          1024.0           640.0   
1        Agfa ePhoto 1680          1998          1280.0           640.0   
2        Agfa ePhoto CL18          2000           640.0             0.0   
3        Agfa ePhoto CL30          1999          1152.0           640.0   
4  Agfa ePhoto CL30 Clik!          1999          1152.0           640.0   

   Effective pixels  Zoom wide (W)  Zoom tele (T)  Normal focus range  \
0               0.0           38.0          114.0                70.0   
1               1.0           38.0          114.0                50.0   
2               0.0           45.0           45.0                 0.0   
3               0.0           35.0           35.0                 0.0   
4               0.0           43.0           43.0                50.0   

   Macro focus range  Storage included  Weight (inc. batteries)  Dimensions  \
0               40.0           

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mean(), inplace=True)



Model Accuracy: 0.9231

Sample predictions:
   Actual  Predicted  Correct
0       0          0     True
1       0          0     True
2       0          0     True
3       0          0     True
4       0          0     True
5       1          0    False
6       0          0     True
7       0          0     True
8       0          0     True
9       0          0     True

Feature importance (absolute coefficients):
                    Feature  Coefficient
2          Effective pixels    -0.333181
10            Brand_encoded     0.111597
3             Zoom wide (W)    -0.095824
6         Macro focus range     0.023576
7          Storage included    -0.004045
9                Dimensions    -0.002353
5        Normal focus range    -0.002167
4             Zoom tele (T)    -0.001291
1            Low resolution     0.001236
8   Weight (inc. batteries)     0.001150
0            Max resolution    -0.000317


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
