In [None]:
# Import the libraries used throughout the notebook
import pandas as pd
import seaborn as sns
# Use seaborn's "whitegrid" style for the plots drawn below.
sns.set_style("whitegrid")
# "notebook" plotting context at font scale 1, overriding the line width (2)
# and the font family ("times") through the rc dictionary.
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2,'font.family': [u'times']})

# NOTE: `plt` is bound to matplotlib.pylab here, not matplotlib.pyplot.
import matplotlib.pylab as plt
import numpy as np

In [None]:
# Load the data to analyse from the whitespace-delimited text file.
# sep=r"\s+" is the supported equivalent of the deprecated
# delim_whitespace=True option.
ice = pd.read_csv("SeaIce.txt", sep=r"\s+")
print('shape: ', ice.shape)
# Display the first rows as the cell output.
ice.head()

In [None]:
# Inspect the column means (the original note says to look for a negative mean).
# numeric_only=True restricts the reduction to numeric columns: the frame also
# holds string columns such as data_type, and recent pandas versions raise a
# TypeError instead of silently skipping them.
ice.mean(numeric_only=True)

In [None]:
# Scatter plot of the raw data: extent against year
x, y = ice["year"], ice["extent"]
plt.scatter(x, y, color='red')
plt.xlabel('Year')
plt.ylabel('Extent')

In [None]:
# List the distinct values present in the data_type column
distinct_types = np.unique(ice["data_type"].values)
print('Valores distintos en el campo data_type:', distinct_types)

In [None]:
# Show the rows whose data_type is neither Goddard nor NRTSI-G
expected_types = ['Goddard', 'NRTSI-G']
unexpected_rows = ice[~ice["data_type"].isin(expected_types)]
print(unexpected_rows)

In [None]:
# Remove the rows carrying the unexpected data_type '-9999'.
# .copy() makes ice2 an independent frame rather than a slice of `ice`, so
# modifying it later does not trigger SettingWithCopyWarning.
ice2 = ice[ice.data_type != '-9999'].copy()
print ('shape:', ice2.shape)
# Plot the cleaned data again
x = ice2.year
y = ice2.extent
plt.scatter(x, y, color = 'red')
# The x axis holds ice2.year, so it is labelled 'Year' (it previously said 'Month').
plt.xlabel('Year')
plt.ylabel('Extent')

In [None]:
# Drop the unwanted rows (data_type == '-9999').
# The explicit .copy() gives ice2 its own data, so assigning into it later
# does not act on a slice of `ice` (no SettingWithCopyWarning).
ice2 = ice[ice.data_type != '-9999'].copy()

In [None]:
# Explore linear relationships: extent against month with a fitted regression line.
# x, y and data are passed by keyword because current seaborn releases no
# longer accept them positionally.
sns.lmplot(x="mo", y="extent", data=ice2)

In [None]:
# Mean and variance of the extent for each month
grouped = ice2.groupby('mo')
extent_by_month = grouped['extent']
month_means = extent_by_month.mean()
month_variances = extent_by_month.var()
print('Means:', month_means)
print('Variances:', month_variances)

In [None]:
# Normalize the extent month by month: subtract that month's mean and express
# the difference as a percentage of the mean of the monthly means.
#
# NOTE: this cell overwrites ice2.extent, so it is not idempotent; re-run the
# cell that builds ice2 before running this one a second time.
ice2 = ice2.copy()  # work on an independent frame, never on a slice of `ice`
overall_mean = month_means.mean()  # loop-invariant, computed once
for month in range(1, 13):
    in_month = ice2["mo"] == month
    # A single .loc assignment replaces the chained `ice2.extent[mask] = ...`,
    # which raises SettingWithCopyWarning and does not update ice2 at all
    # when pandas Copy-on-Write is active.
    ice2.loc[in_month, "extent"] = (
        100 * (ice2.loc[in_month, "extent"] - month_means[month]) / overall_mean
    )
# Keyword arguments: current seaborn no longer accepts x/y/data positionally.
sns.lmplot(x="mo", y="extent", data=ice2)

In [None]:
# Mean and variance of the extent column of ice2
extent_values = ice2["extent"]
print('mean:', extent_values.mean())
print('var:', extent_values.var())

In [None]:
# Scatter plot of extent against year with a fitted linear regression line.
# x/y/data are passed by keyword and the figure size uses `height`, the
# current name of the former `size` parameter.
sns.lmplot(x="year", y="extent", data=ice2, height=5.2, aspect=2)

In [None]:
# Analysis restricted to January (mo == 1)
jan = ice2[ice2.mo == 1]
# Keyword arguments and `height` (formerly `size`) match the current seaborn API.
sns.lmplot(x="year", y="extent", data=jan, height=5.2, aspect=2)

In [None]:
# Analysis restricted to August (mo == 8)
aug = ice2[ice2.mo == 8]
# Keyword arguments and `height` (formerly `size`) match the current seaborn API.
sns.lmplot(x="year", y="extent", data=aug, height=5.2, aspect=2)

In [None]:
# Fit a linear model of extent on year with scikit-learn's LinearRegression
from sklearn.linear_model import LinearRegression

# Double brackets keep both the feature and the target as single-column frames.
x = ice2[['year']]
y = ice2[['extent']]

est = LinearRegression(fit_intercept=True)
est.fit(x, y)

print("Coefficients:", est.coef_)
print("Intercept:", est.intercept_)

In [None]:
# Evaluate the fitted model on the data it was trained on.
from sklearn import metrics
y_hat = est.predict(x)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# but R^2 is not: with the arguments swapped it measures how well the observed
# values explain the predictions, which is not the model's R^2.
print ("MSE:", metrics.mean_squared_error(y, y_hat))
print ("R^2:", metrics.r2_score(y, y_hat))
print ('var:', y.var())

In [None]:
# Predict the extent for the year 2025 and convert it back to the original scale.
# The model was fitted on a DataFrame with a 'year' column, so the query is
# passed with the same column name; a bare nested list makes scikit-learn warn
# about missing feature names.
x = pd.DataFrame({"year": [2025]})
y_hat = est.predict(x)
m = 1 # January
# Invert the per-month normalization: scale the percentage by the mean of the
# monthly means and add that month's mean back.
y_hat = (y_hat*month_means.mean () /100) + month_means[m]
print ("Predicción de extensión del hielo para enero de 2025 (en millones de km cuadrados):", y_hat)