In [26]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [29]:
dataset = pd.read_csv('towns.csv', delimiter=" ")

In [30]:
def predict(X, Y):
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
  model = LinearRegression().fit(X_train, Y_train)
  print('Predictions: ', model.predict(X_test))
  print('Score: ', model.score(X_test, Y_test))
  print('Coefficients: ', model.coef_)

## The average temperature of a city based on the temperatures in the 12 months 

In [33]:
dataset['Avg'] = dataset.mean(axis = 1)

dataset.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,#town,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Avg
0,Tallinn,-3,-5,-1,3,10,13,16,15,10,6,1,-2,5.25
1,Beijing,-3,0,6,13,20,24,26,25,20,13,5,-1,12.333333
2,Berlin,0,-1,4,7,12,16,18,17,14,9,4,1,8.416667
3,Buenos_Aires,23,22,20,16,13,10,10,11,13,16,18,22,16.166667
4,Cairo,13,15,17,21,25,27,28,27,26,23,19,15,21.333333


In [34]:
Y = dataset['Avg']
X = dataset[['Jan',	'Feb','Mar',	'Apr',	'May',	'Jun',	'Jul',	'Aug',	'Sep',	'Oct',	'Nov',	'Dec']]

predict(X,Y)

Predictions:  [ 8.41542859 15.0807669  10.75093218 16.67914562 16.17910786]
Score:  0.9999939733523201
Coefficients:  [0.08473247 0.08477325 0.08030895 0.08544105 0.08317926 0.08453063
 0.07815772 0.08567841 0.083712   0.08479601 0.08274031 0.08124988]


### Conclusions

The model successfully understands that in order to predict the average, all months should have the same weight, thus having very good (almost perfect) results for this linear regression.

## The maximum temperature of a city based on the temperatures in the 12 months

In [35]:
dataset.drop('Avg', axis=1)
dataset['Max'] = dataset.max(axis = 1)

dataset.head()

  


Unnamed: 0,#town,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Avg,Max
0,Tallinn,-3,-5,-1,3,10,13,16,15,10,6,1,-2,5.25,16.0
1,Beijing,-3,0,6,13,20,24,26,25,20,13,5,-1,12.333333,26.0
2,Berlin,0,-1,4,7,12,16,18,17,14,9,4,1,8.416667,18.0
3,Buenos_Aires,23,22,20,16,13,10,10,11,13,16,18,22,16.166667,23.0
4,Cairo,13,15,17,21,25,27,28,27,26,23,19,15,21.333333,28.0


In [36]:
Y = dataset['Max']
X = dataset[['Jan',	'Feb','Mar',	'Apr',	'May',	'Jun',	'Jul',	'Aug',	'Sep',	'Oct',	'Nov',	'Dec']]

predict(X, Y)

Predictions:  [15.47029112 23.94034508 21.09833524 17.93882529 16.75788308]
Score:  0.6737029078740693
Coefficients:  [ 1.32093702 -0.53194085 -0.24074783  1.9401224  -0.05113837 -1.72969121
  1.42572254 -2.86109468  0.68022168  3.77943202 -0.88395022 -2.82584188]


**What happens if you remove the three south most cities?**

In [37]:
dataset.drop(dataset[dataset['#town'] =='Buenos_Aires'].index, inplace = True)
dataset.drop(dataset[dataset['#town'] =='Singapore'].index, inplace = True)
dataset.drop(dataset[dataset['#town'] =='Cape_Town'].index, inplace = True)

dataset.head()

Unnamed: 0,#town,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Avg,Max
0,Tallinn,-3,-5,-1,3,10,13,16,15,10,6,1,-2,5.25,16.0
1,Beijing,-3,0,6,13,20,24,26,25,20,13,5,-1,12.333333,26.0
2,Berlin,0,-1,4,7,12,16,18,17,14,9,4,1,8.416667,18.0
4,Cairo,13,15,17,21,25,27,28,27,26,23,19,15,21.333333,28.0
6,Helsinki,-5,-6,-2,3,10,13,16,15,10,5,0,-3,4.666667,16.0


In [38]:
Y = dataset['Max']
X = dataset[['Jan',	'Feb','Mar',	'Apr',	'May',	'Jun',	'Jul',	'Aug',	'Sep',	'Oct',	'Nov',	'Dec']]

predict(X, Y)

Predictions:  [25.73317855 27.52207751 22.86465371 17.01005506]
Score:  0.9953909664767504
Coefficients:  [ 0.04387383  0.08378287 -0.07720621  0.03978835  0.04540257 -0.01979893
  0.72577614  0.03951036  0.12027907  0.16615429 -0.09028301 -0.1701956 ]


### Conclusions

In order to predict the max temperature, the model should learn that for the northern hemisphere, the months of June and July probably should have more weight than the winter ones. In the first model, as we have some cities from the southern hemisphere, it can not differenciate between the two, so the score is not that good, but as soon as we remove the 3 cities from the southern hemisphere, the score goes up to almost perfect.

## The temperature in April of a city based on the other 11 months temperature

In [39]:
dataset = pd.read_csv('towns.csv', delimiter=" ")

dataset.head()

Unnamed: 0,#town,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Tallinn,-3,-5,-1,3,10,13,16,15,10,6,1,-2
1,Beijing,-3,0,6,13,20,24,26,25,20,13,5,-1
2,Berlin,0,-1,4,7,12,16,18,17,14,9,4,1
3,Buenos_Aires,23,22,20,16,13,10,10,11,13,16,18,22
4,Cairo,13,15,17,21,25,27,28,27,26,23,19,15


In [40]:
Y = dataset[['Apr']]
X = dataset[['Jan',	'Feb', 'Mar',	'May',	'Jun',	'Jul',	'Aug',	'Sep',	'Oct',	'Nov',	'Dec']]
predict(X, Y)

Predictions:  [[ 6.53337087]
 [ 2.84731699]
 [ 7.24434419]
 [10.25578562]
 [ 2.11627806]]
Score:  0.5861635413481388
Coefficients:  [[-0.17192502  0.01980581  0.48006405  1.29791111 -0.19173083  0.21578375
  -0.76965404  1.28157241 -1.42870763 -0.37028702  0.75925289]]


### Conclusions

We can see that the model did not behave as well as the previous ones, and that is because it is hard for the model to know that in order to get a good prediction for a month's temperature, the months right before or right after should have an higher weight than the other ones.

## Collect the latitudes of the given cities from the Internet. Predict the latitude using the temperature values in the cities

In [41]:
dataset['lat'] = [59.4370, 39.9042, 52.5200, 34.6037, 30.0444, 33.9249, 60.1699, 51.5072, 55.7558, 45.4215, 48.8566, 56.9677, 41.9028, 1.3521, 59.3293, 38.9072]

dataset.head()

Unnamed: 0,#town,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,lat
0,Tallinn,-3,-5,-1,3,10,13,16,15,10,6,1,-2,59.437
1,Beijing,-3,0,6,13,20,24,26,25,20,13,5,-1,39.9042
2,Berlin,0,-1,4,7,12,16,18,17,14,9,4,1,52.52
3,Buenos_Aires,23,22,20,16,13,10,10,11,13,16,18,22,34.6037
4,Cairo,13,15,17,21,25,27,28,27,26,23,19,15,30.0444


In [42]:
Y = dataset['lat']
X = dataset[['Jan',	'Feb', 'Mar', 'Apr',	'May',	'Jun',	'Jul',	'Aug',	'Sep',	'Oct',	'Nov',	'Dec']]

predict(X, Y)

Predictions:  [41.53988021 34.28788909 39.66038551 58.24757071 62.03411894]
Score:  0.2727284921337718
Coefficients:  [ 3.24451746 -0.81895501 -5.8189146  -1.06379346 -1.5179627   9.71423963
 -9.65818579 -2.235073    6.28362631  0.29827378 -2.15675353  1.27441442]


### Conclusions

The cities with the highest latitude are usually the ones with the higher absolute temperatures, or, in other words, that have the coldest temperatures in the winter. That is why the model should give the winter months more weight, when deciding which latitude a city is in.