# Part 1

In [1]:
import pandas as pd

In [2]:
weather_hist = pd.read_csv('data/weatherHistory.csv')

In [3]:
weather_hist.rename({
    'Formatted Date': 'date',
    'Temperature (C)': 'temp',
    'Apparent Temperature (C)': 'temp_app',
    'Precip Type': 'weather_type',
    'Wind Speed (km/h)': 'wind_speed',
    'Humidity': 'humidity',
    'Daily Summary': 'summary'
},
    axis=1, inplace=True)

In [4]:
weather_hist.loc[:, 'date'] = weather_hist.date.apply(pd.to_datetime)

## The coldest day

By real temperature:

In [5]:
coldest_day = weather_hist.loc[weather_hist.temp.idxmin()]
print('The coldest day in Szeged was on',
      str(pd.to_datetime(coldest_day.date).date())
      + ', it was just', str(coldest_day.temp), 'degrees.')

The coldest day in Szeged was on 2012-02-10, it was just -21.822222222222226 degrees.


By apparent temperature:

In [6]:
coldest_day_app = weather_hist.loc[weather_hist.temp_app.idxmin()]
print('The apparent coldest day in Szeged was on',
      str(pd.to_datetime(coldest_day_app.date).date())
      + ', it was just', str(coldest_day_app.temp), 'degrees.')

The apparent coldest day in Szeged was on 2012-02-11, it was just -20.0 degrees.


## The most windy day

In [7]:
most_windy_day = weather_hist.loc[weather_hist.wind_speed.idxmax()]
print('The most windy day in Szeged was on',
      str(pd.to_datetime(most_windy_day.date).date())
      + ', with wind speed reaching', str(most_windy_day.wind_speed), 'km/h.')

The most windy day in Szeged was on 2007-01-29, with wind speed reaching 63.8526 km/h.


## How many rainy days there were during the summer of 2014

In [8]:
rainy_days = weather_hist[weather_hist.weather_type == 'rain']
rainy_days = rainy_days[[x.date.year == 2014 and x.date.month in (6, 7, 8)
                         for _, x in rainy_days.iterrows()]]
rainy_days = set(rainy_days.date.apply(lambda x: x.date()))

print(f'There were {len(rainy_days)} rainy days in Szeget during the summer of 2014.')

There were 92 rainy days in Szeget during the summer of 2014.


## groupby

In [9]:
weather_hist['year'] = weather_hist.date.apply(lambda x: x.year)
weather_hist['month'] = weather_hist.date.apply(lambda x: x.month)
weather_hist['week'] = weather_hist.date.apply(lambda x: x.week)

### The most windy month

In [10]:
weather_hist.groupby('month').agg(['mean'])

Unnamed: 0_level_0,temp,temp_app,humidity,wind_speed,Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),year,week
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,0.81389,-1.939615,0.850723,11.512816,180.405425,7.830584,0.0,1006.125792,2011.0,4.255132
2,2.159699,-0.565343,0.8134,12.185543,188.717176,8.731368,0.0,1003.929313,2011.009646,6.948553
3,6.906599,5.074268,0.702966,13.405461,186.53848,10.91045,0.0,1001.551536,2011.0,11.180105
4,12.756417,12.141164,0.641133,11.893094,186.653157,11.784224,0.0,1009.996332,2011.0,15.539394
5,16.873692,16.732751,0.691325,10.959337,188.071237,11.892754,0.0,1003.49953,2011.0,19.894428
6,20.715617,20.742264,0.68647,9.626471,202.780808,11.990266,0.0,1001.883742,2011.0,24.254545
7,22.963943,22.976086,0.639657,9.639907,202.812248,12.18782,0.0,1008.566431,2010.9989,28.609828
8,22.345031,22.270497,0.635542,8.933431,195.3298,12.455549,0.0,1001.716944,2010.997076,33.032164
9,17.51679,17.404283,0.68879,9.621813,187.667424,11.602874,0.0,1000.565347,2011.0,37.39697
10,11.342247,10.694346,0.774554,10.000153,176.91263,9.741691,0.0,1003.243458,2011.0,41.755339


The most windy month is March.

### The most humid year

In [11]:
weather_hist.groupby('year').agg(['mean'])

Unnamed: 0_level_0,temp,temp_app,humidity,wind_speed,Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),month,week
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2006,11.215365,10.158215,0.767341,10.189852,189.342808,9.758083,0.0,992.543529,6.526826,26.573402
2007,12.135239,11.063135,0.689652,10.825392,194.119292,10.408191,0.0,1001.640226,6.526826,26.433676
2008,12.161876,11.053184,0.701237,11.303897,193.82525,10.267119,0.0,1007.734504,6.514458,26.364071
2009,12.26791,11.069337,0.707247,11.505948,180.496461,10.00137,0.0,1002.608735,6.526826,27.003425
2010,11.202061,10.098643,0.796858,11.015628,192.067509,9.14081,0.0,1004.811891,6.530852,26.593693
2011,11.524453,10.584294,0.736017,9.898262,189.342352,9.507929,0.0,1014.184075,6.526826,26.573402
2012,11.986726,10.753081,0.6895,11.264545,187.755692,10.363108,0.0,999.341481,6.514458,26.503529
2013,11.940719,10.82532,0.754209,10.969389,187.861758,10.892049,0.0,1004.950764,6.526826,26.433562
2014,12.529737,11.628703,0.748578,10.502473,178.226219,11.320806,0.0,987.394676,6.526664,26.432682
2015,12.31137,11.340911,0.732355,10.735247,184.098174,10.912786,0.0,1005.179401,6.526826,27.003425


The most humid year is 2010.

### The week with the most unstable weather

In [12]:
week_range_data = weather_hist[['temp', 'temp_app', 'wind_speed', 'week']].\
    groupby('week').agg(['max', 'min'])
weather_range = pd.Series([0.] * (len(week_range_data)))
weather_range.index += 1
week_range_data['range'] = weather_range

In [13]:
for feature in ['temp', 'temp_app', 'wind_speed']:
    feature_range = week_range_data[feature]['max'] - week_range_data[feature]['min']
    mean = feature_range.mean()
    std = feature_range.std()
    week_range_data['range'] += (feature_range - mean) / std

In [14]:
week_range_data.head()

Unnamed: 0_level_0,temp,temp,temp_app,temp_app,wind_speed,wind_speed,range
Unnamed: 0_level_1,max,min,max,min,max,min,Unnamed: 7_level_1
week,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,12.838889,-13.255556,12.838889,-19.472222,42.7938,0.0,-0.393012
2,15.738889,-15.0,15.738889,-18.316667,44.6292,0.0,1.949189
3,13.888889,-11.977778,13.888889,-19.961111,46.8349,0.0,0.559251
4,14.772222,-14.088889,14.772222,-22.094444,55.9314,0.0,3.804308
5,14.927778,-17.777778,14.927778,-23.872222,63.8526,0.0,6.882768


In [15]:
week_range_data.range.idxmax()

5

In [16]:
week_range_data.loc[week_range_data.range.idxmax()].range

    6.882768
Name: 5, dtype: float64

The week with the most unstable weather by the unstability of real temperature, apparent temperature, and wind speed is the fifth week of the year (end of January—beginning of February).

# Part 2

In [17]:
import re
import numpy as np
from scipy.sparse import hstack

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, SGDRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Manual text feature extraction

In [18]:
data = pd.read_csv('data/data.csv')

In [19]:
# Old features.

data['text_len'] = data['description'].apply(len)
data['text_len_tokens'] = data['description'].apply(lambda x: len(x.split()))
data['number_of_caps'] = data['description'].apply(lambda x: len([ch for ch in x if ch.isupper()]))
data['number_of_nonalphanum'] = data['description'].apply(lambda x: len([ch for ch in x if not ch.isalnum()]))

In [20]:
# New features.

# Number of non-alphabetic characters. Digits can make an ad more concrete and therefore attractive.
data['number_of_nonalpha'] = data['description'].apply(lambda x: len([ch for ch in x if not ch.isalpha()]))

# Number of latin characters or tokens. These are, as a rule, brand/model names.
# Like digits, they can make an ad more concrete and therefore attractive.
data['number_of_latin'] = data.description.apply(
    lambda x: len([ch for ch in x if re.search(r'[a-z]', ch.lower()) is not None]))
data['number_of_latin_tokens'] = data.description.apply(
    lambda x: len([token for token in x.split() if re.search(r'^[a-z]+$', token.lower()) is not None]))

# Average token length. It correlates with the overall text readability.
data['mean_token_len'] = data.description.apply(lambda x: np.mean([len(token) for token in x.split()]))

# Title length in characters and tokens.
# Too long titles may be skipped over out of laziness, while too short ones do not cause interest.
data['title_len'] = data.title.apply(len)
data['title_len_tokens'] = data.title.apply(lambda x: len(x.split()))

In [21]:
X = data[['text_len', 'text_len_tokens', 'number_of_caps', 'number_of_nonalphanum',
          'number_of_nonalpha', 'number_of_latin', 'number_of_latin_tokens',
          'mean_token_len', 'title_len', 'title_len_tokens']].values
y = data.deal_probability.values

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [23]:
regressor = ElasticNet(alpha=1)
regressor.fit(X_train, y_train)
preds = regressor.predict(X_test)
np.sqrt(mean_squared_error(y_test, preds))

0.26367317664827983

Strictly speaking, our metric did grow by 0.03 percentage points.

## CountVectorizer + TfidfVectorizer

### 1. TfidfVectorizer: more n-grams

In [24]:
cv = CountVectorizer(max_features=1000, min_df=5, max_df=0.4, ngram_range=(1, 2))
tfidf = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.4, analyzer='char', ngram_range=(3, 4))

X_cv = cv.fit_transform(data.description)
X_tfidf = tfidf.fit_transform(data.description)

X = hstack([X_cv, X_tfidf])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
regressor = SGDRegressor(max_iter=50, alpha=0.00001)
regressor.fit(X_train, y_train)
preds = regressor.predict(X_test)
np.sqrt(mean_squared_error(y_test, preds))

0.2500328456482447

### 2. SGDRegressor: bigger alpha

In [27]:
regressor = SGDRegressor(max_iter=50, alpha=1000)
regressor.fit(X_train, y_train)
preds = regressor.predict(X_test)
np.sqrt(mean_squared_error(y_test, preds))

0.26372767690265503

We managed to surpass, albeit not by much, the manually extracted features.

### 3. max_df decrease: now frequent words are ignored more often

In [28]:
cv = CountVectorizer(max_features=1000, min_df=5, max_df=0.25, ngram_range=(1, 2))
tfidf = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.25, analyzer='char', ngram_range=(3, 4))

X_cv = cv.fit_transform(data.description)
X_tfidf = tfidf.fit_transform(data.description)

X = hstack([X_cv, X_tfidf])

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [30]:
regressor = SGDRegressor(max_iter=50, alpha=1000)
regressor.fit(X_train, y_train)
preds = regressor.predict(X_test)
np.sqrt(mean_squared_error(y_test, preds))

0.26371245360890216

Here we can no longer see an improvement.