# Assignment 3
## Download Dependencies - Dataset, Pre-Trained Models
* Download pre-trained GloVe word embedding vectors from https://nlp.stanford.edu/projects/glove to help with the assignment and classification
* Download the IMDB movie review dataset from https://ai.stanford.edu/~amaas/data/sentiment/

In [60]:
import os
import urllib.request
import zipfile
import tarfile

def download_and_extract_if_not_exist(url, extract_path, zip_file_name, zip_file_contents_file_name):
    """Download an archive and unpack it into ``extract_path`` unless already done.

    The function is safe to re-run: if the expected extracted entry is already
    present nothing happens, and if the archive itself is already on disk the
    download step is skipped. The archive is deleted after a successful
    extraction.

    Args:
        url: Location to download the archive from.
        extract_path: Directory that receives both the archive and its
            extracted contents. Created if it does not exist.
        zip_file_name: File name to store the archive under inside
            ``extract_path``. Must end in ``.zip``, ``.tar.gz`` or ``.tgz``.
        zip_file_contents_file_name: Name of one file or directory (relative
            to ``extract_path``) that exists once extraction has completed.
            Its presence is used as the "already extracted" marker.

    Raises:
        ValueError: If ``zip_file_name`` does not have a supported extension.
            Raised before anything is downloaded or deleted.

    Errors from the download or from extraction propagate unchanged; in that
    case the archive is not removed.
    """
    # A single known entry acts as the marker that extraction already happened.
    if os.path.exists(os.path.join(extract_path, zip_file_contents_file_name)):
        print("Files already exist. Skipping download.")
        return

    zip_file_path = os.path.join(extract_path, zip_file_name)

    # Validate the archive type up front. Without this check an unsupported
    # extension would fall through both extraction branches and the archive
    # would then be deleted without ever having been unpacked.
    is_zip = zip_file_path.endswith(".zip")
    is_tar_gz = zip_file_path.endswith((".tar.gz", ".tgz"))
    if not (is_zip or is_tar_gz):
        raise ValueError(
            f"Unsupported archive type for '{zip_file_name}': "
            "expected a .zip, .tar.gz or .tgz file"
        )

    # Reuse an archive that is already on disk to save time.
    if not os.path.exists(zip_file_path):
        print("Downloading zip file...")
        os.makedirs(extract_path, exist_ok=True)
        # Download to a temporary name and rename once complete, so that an
        # interrupted download is never mistaken for a finished archive on
        # the next run.
        partial_path = zip_file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, zip_file_path)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    else:
        print("Zip file already exists. Skipping download.")

    # Extract the contents of the archive into the target directory.
    if is_zip:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    else:
        with tarfile.open(zip_file_path, 'r:gz') as tar_ref:
            # The archive comes from the network: where the running Python
            # supports extraction filters, use the "data" filter so members
            # cannot be written outside extract_path.
            if hasattr(tarfile, "data_filter"):
                tar_ref.extractall(extract_path, filter="data")
            else:
                tar_ref.extractall(extract_path)

    # Remove the archive to free up space now that its contents are extracted.
    os.remove(zip_file_path)
    print(f"Files extracted to: {extract_path}")

### Download Pre-Trained Word Embeddings

In [61]:
# Target folder for the pre-trained word embeddings
folder_to_download = "./models"

# GloVe trained on 6 billion tokens
embedding_url = "https://nlp.stanford.edu/data/glove.6B.zip"
file_name_to_download = "glove.6B.zip"

# Alternative: GloVe trained on 42 billion tokens
# (if enabled, embedding_file_name below has to be changed to match)
# embedding_url = "https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip"
# file_name_to_download = "glove.42B.300d.zip"

# The extracted file used both as the "already downloaded" marker and as the embedding source
embedding_file_name = "glove.6B.50d.txt"

# Fetch and unpack the archive (skipped when the embedding file is already present)
download_and_extract_if_not_exist(
    url=embedding_url,
    extract_path=folder_to_download,
    zip_file_name=file_name_to_download,
    zip_file_contents_file_name=embedding_file_name,
)

# Path to the embedding file for use by later cells
word_embedding_file_path = os.path.join(folder_to_download, embedding_file_name)

Downloading zip file...
Files extracted to: ./models


### Download Movie Review Dataset

In [62]:
# Download the dataset into the following folder:
folder_to_download = "./dataset"

# IMDB Large Movie Review Dataset (v1)
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
file_name_to_download = "aclImdb_v1.tar.gz"

# The archive unpacks into a top-level "aclImdb" directory. Using it as the
# marker lets re-runs detect the extracted dataset and skip the download
# (the previous marker, "glove.6B.50d.txt", never exists in this folder, so
# the dataset was downloaded again on every run).
dataset_folder_name = "aclImdb"

# Download and extract, then set the path to the dataset
download_and_extract_if_not_exist(dataset_url, folder_to_download, file_name_to_download, dataset_folder_name)
dataset_file_path = os.path.join(folder_to_download, dataset_folder_name)

Downloading zip file...
Files extracted to: ./dataset


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset