In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download to disk.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries; the
# download loop splits on ',' and ':' and unquotes the URL before fetching it.
# The URLs are signed (X-Goog-Date=20240307T093400Z, X-Goog-Expires=259200 seconds),
# so they stop working once that window has passed.
# NOTE(review): signed URLs are embedded in the notebook — regenerate them instead of
# relying on these values.
DATA_SOURCE_MAPPING = 'iowa-house-prices:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F13812%2F18623%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240307%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240307T093400Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D227015aa20192ff4ecf0daac08545c49d2f9c69d6a9398b955fb5bcc065e6bfd97fed7f75bd80f953b497dc31826b3fb44063910ea45bf3af7a1a34f92edb25c8c26db42abcca8f95703acab4ac63522e7673a4e1261bda433472bdb1aa3dc377d7017ca1b3cd298dd4e2eddc8fa270dd91c59a0f5a757f012af2521d9dc56703e0fdd4f7e81fdb25b4c840e44dd657354ea9d2ba3d3e2f92b9adde450146f8c8b3cda50bcc4098b18b226c3e9b0da57fbb052c58cc73c7e94d871defe88de064fea4ca767294f302d5aac89e5582d55eb70756b32239f03e3a80b1ea5f8714bcbaa995995129c2513459d5d7fdee352883e5877dd57376060101dd8eda3593b,melbourne-housing-snapshot:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2709%2F38454%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240307%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240307T093400Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D62b03ffd498e3f964804fc06757dc34031a761f4692cf110d8f4c02fb2e86f17bcf48e42792a7c8674cef0c197896925e426e503c3e1d1dbcab16b582a60210b9150b885fbfbc6fbe172990710fd5bc163203fa88ae7741ee7d4205693c162b71337e43e79241245c82e940d47a50a1777ede74c7967417d78b11a44a9c3f7585c417ba6270fda28e36780803d6a82b766836b3236403498a15bf244c3faa02fa345c0f4214780da080115c4d2ad1789b5b57dfbd36cac9a438ac862f886d819d5f040aa3682233361269b426c8f0a6196b91f9dc33298bd8e9b3952263905f6b576b872e2d66a48bfb70b03b0e092a42013da657f4c2c20f4b71cfe917eefc3'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it into
# KAGGLE_INPUT_PATH/<directory>. A source that fails to download is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Entries look like '<directory>:<percent-encoded URL>'. Split on the first
    # colon only, so a literal ':' inside the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')

            # Content-Length is optional; without a usable value the progress
            # bar simply stays empty instead of raising on int(None).
            try:
                expected_bytes = int(total_length)
            except (TypeError, ValueError):
                expected_bytes = 0

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if expected_bytes > 0:
                    # Clamp so a body longer than announced cannot overflow the bar.
                    done = min(50, int(50 * dl / expected_bytes))
                else:
                    done = 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()

            # tarfile reopens the temp file by name, so buffered bytes must be
            # on disk before extraction starts.
            tfile.flush()

            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind to a name other than `tarfile` so the module is not shadowed.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


**[Machine Learning Course Home Page](https://www.kaggle.com/learn/machine-learning)**

---


This exercise will test your ability to read a data file and understand statistics about the data.

In later exercises, you will apply techniques to filter the data, build a machine learning model, and iteratively improve your model.

The course examples use data from Melbourne. To ensure you can apply these techniques on your own, you will have to apply them to a new dataset (with house prices from Iowa).

The exercises use a "notebook" coding environment.  In case you are unfamiliar with notebooks, we have a [90-second intro video](https://www.youtube.com/watch?v=4C2qMnaIKL4).

# Exercises

Run the following cell to set up code-checking, which will verify your work as you go.

In [None]:
# Set up code checking
from learntools.core import binder
# Hand this notebook's global namespace to the learntools binder.
# NOTE(review): presumably this lets the checker read variables defined in
# later cells — confirm against the learntools documentation.
binder.bind(globals())
# Star-import of the exercise-2 module; presumably the source of the `step_1`
# and `step_2` objects used in later cells — verify against learntools.
from learntools.machine_learning.ex2 import *
# NOTE(review): check_output is not referenced anywhere in the visible notebook.
from subprocess import check_output
print("Setup Complete")

## Step 1: Loading Data
Read the Iowa data file into a Pandas DataFrame called `home_data`.

### Import of 'pandas' library

In [None]:
import pandas as pd

**Get data**<br>
- Add the required datasets with the '+ Add data' option in the upper-right corner of this screen.
- Access the datasets through their file paths and load them into pandas DataFrames with the appropriate pandas commands.

In [None]:
# Location of the Iowa training data, relative to the notebook's working directory.
iowa_file_path = '../input/iowa-house-prices/train.csv'

# Parse the CSV into the DataFrame named `home_data`, as the exercise asks.
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)

In [None]:
# Lines below will give you a hint or solution code
# step_1.hint()
# step_1.solution()

## Step 2: Review The Data
Use the command you learned to view summary statistics of the data. Then fill in variables to answer the following questions.

In [None]:
from datetime import date

# Summary statistics (count, mean, min, max, quartiles, ...) for the Iowa data.
data_summary = home_data.describe()

# Average lot size, rounded to the nearest integer.
avg_lot_size = round(data_summary.loc['mean', 'LotArea'])

# Age of the newest home as of today. The year is taken from the system clock
# rather than hard-coded, so the value stays correct when the notebook is
# re-run in a later year.
newest_home_age = date.today().year - int(data_summary.loc['max', 'YearBuilt'])

print('\navg_lot_size:', avg_lot_size)
print('\nnewest_home_age:', newest_home_age)

In [None]:
from datetime import date

# What is the average lot size (rounded to nearest integer)?
avg_lot_size = 10517

# As of today, how old is the newest home (current year - the date in which it was built)
# Derived from the current year instead of a literal: a fixed answer such as 11
# is only right in the year it was typed in.
newest_home_age = date.today().year - int(data_summary.loc['max', 'YearBuilt'])

# Checks your answers
step_2.check()

In [None]:
# step_2.hint()
# step_2.solution()

**Second dataset**

In [None]:
# Location of the Melbourne housing snapshot, relative to the notebook's working directory.
melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv'

# Load the snapshot into a DataFrame for the cells that follow.
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

In [None]:
# Columns inspected in the next cells.
# NOTE(review): 'Lattitude' / 'Longtitude' are presumably spelled this way in
# the CSV header — confirm before "correcting" them.
features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

**Check types of data**

In [None]:
# Dtypes and non-null counts for the selected feature columns only.
melbourne_data.loc[:, features].info()

**Statistics of data**

In [None]:
# Summary statistics for the selected feature columns only.
melbourne_data.loc[:, features].describe()

**Check how many features available**

The dataset's columns are its features. `melbourne_data.columns` holds all of the column names, so counting its entries gives the total number of features.

In [None]:
# Second entry of `shape` is the number of columns, i.e. the number of features.
melbourne_data.shape[1]

**Drop rows containing missing data**
- axis=0, means row
- axis=1, means column

__dropna()__
<br> This command drops the rows (if `axis=0`) or the columns (if `axis=1`) that contain null values such as `NaN`. The pandas default is `axis=0`.

In [None]:
# The Melbourne data has some missing values (some houses for which some
# variables weren't recorded).
# Handling missing values is covered in a later tutorial, and the Iowa columns
# used in the exercises don't have any. So, for now, take the simplest option
# and drop the incomplete houses from the data.

# dropna removes entries with missing values (think of "na" as "not available").
# axis=0 drops rows, axis=1 would drop columns; how='any' (the pandas default)
# removes a row as soon as one of its values is missing.
melbourne_data = melbourne_data.dropna(axis=0, how='any')

Types of <u>*data*</u>

In [None]:
# Dtypes and non-null counts for every column of the frame left after dropna.
melbourne_data.info()

__Statistics of the data__

In [None]:
# Summary statistics for the frame left after dropna.
melbourne_data.describe()

**First five rows**

In [None]:
# The 'Price' column; it is used as the prediction target when the model is fitted.
y = melbourne_data['Price']
# Last expression of the cell, so its first five values are displayed.
y.head()

In [None]:
# NOTE(review): `x` is assigned the same 'Price' column as `y` in the previous
# cell, and lowercase `x` is not used anywhere else in the visible notebook
# (the model uses uppercase `X`). This looks like a copy-paste leftover —
# confirm whether a feature selection was intended here.
x = melbourne_data['Price']
x.head()

In [None]:
# Input columns for the model built later in the notebook.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
# Echo the list as the cell's output.
melbourne_features

## Think About Your Data

The newest house in your data isn't that new.  A few potential explanations for this:
1. They haven't built new houses where this data was collected.
2. The data was collected a long time ago. Houses built after the data publication wouldn't show up.

If the reason is explanation #1 above, does that affect your trust in the model you build with this data? What about if it is reason #2?

How could you dig into the data to see which explanation is more plausible?

Check out this **[discussion thread](https://www.kaggle.com/learn-forum/60581)** to see what others think or to add your ideas.

# Keep Going

You are ready for **[Your First Machine Learning Model](https://www.kaggle.com/dansbecker/your-first-machine-learning-model).**


In [None]:
# Target first, then the feature matrix; both come from the cleaned Melbourne frame.
y = melbourne_data["Price"]
X = melbourne_data[melbourne_features]

# Preview the first rows of each in one print call.
print(
    '\nX.head()\n: ', X.head(),
    '\ny.head()\n:', y.head(),
)

---
**[Machine Learning Course Home Page](https://www.kaggle.com/learn/machine-learning)**



**DecisionTreeRegressor**
<br> This is a very basic Machine Learning model that is very often used.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Build the regressor. Fixing random_state makes every run produce the same tree.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Train on the selected features (X) against the sale prices (y).
melbourne_model.fit(X=X, y=y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

In [None]:
# Take the first five feature rows once and reuse them for display and prediction.
sample_houses = X.head()

print("Making predictions for the following 5 houses:")
print(sample_houses)
print("The predictions are")
print(melbourne_model.predict(sample_houses))