**This is a sample of how to use Data_Preprocessing**\

## Data Preprocessing
- combine the first two batches of files for training usage
- the third batch used for validation
- crawl training, validation, and test images from given URLs 
- extract features: image id, image url, article_title
- translate article titles into English using the Google Translate API (https://github.com/ssut/py-googletrans) \

Note: The Google Translate Python library only works on Linux. If you use Windows, save the translated text to your cloud/local machine for further processing.

In [47]:
import os
import requests  # to get image from the web
import shutil
import pandas as pd
import cv2
import icrawler
from icrawler.builtin  import GoogleImageCrawler

In [None]:
# Folder that holds the input .tsv files, relative to the working directory.
data_folder = "../data"
# Sanity check: evaluates to True when that folder is present.
os.path.isdir(data_folder)

In [51]:
# File names of the four data batches, looked up inside ``data_folder``.
train_01 = "content2019-01-v3.tsv"
train_02 = "content2019-02-v3.tsv"
validation = "content2019-03-v3.tsv"
test = "MediaEvalNewsImagesBatch04images.tsv"

In [None]:
def create_folder(path):
    """
    create_folder creates the directory (and any missing parents)
    unless something already exists at that path
    :param path: directory path to create
    """
    if os.path.exists(path):
        # Nothing to do: the path is already there.
        return
    os.makedirs(path)

In [None]:
# Top-level working directories.
create_folder('processed_data')
create_folder('img')

# Output locations for the processed tables and images; create them as well.
processed_data_folder = 'processed_data/data'
processed_img_folder = 'processed_data/img'
create_folder(processed_data_folder)
create_folder(processed_img_folder)

### Combine Files
We use the first two batches as the training set and the third as the validation set.\
We combine the first two files into one file, and combine all three files into a train_eval file for testing.

In [None]:
def combine_files(filenames, output_file, skip=True):
    """
    combine_files concatenates a list of text files into one file
    :param filenames: a list of input file paths, written in the given order
    :param output_file: output file (overwritten if it already exists)
    :param skip: when true, the first line (header) of every file except the
                 first one is dropped so the header appears only once;
                 defaults to True so two-argument calls keep working
    """
    with open(output_file, "w", encoding="utf-8") as output:
        needs_separator = False
        for position, filename in enumerate(filenames):
            with open(filename, "r", encoding="utf-8") as infile:
                if skip and position != 0:
                    # The None default keeps an empty input file from
                    # raising StopIteration.
                    next(infile, None)
                contents = infile.read()
            if not contents:
                continue
            if needs_separator:
                # The previous file did not end with a newline; without this
                # its last row would be glued onto the first row written now.
                output.write("\n")
            output.write(contents)
            needs_separator = not contents.endswith("\n")

In [None]:
# Build the training file (batches 1+2) and the train+eval file (batches 1+2+3).
# skip is passed explicitly as True: the header of every file after the first
# is dropped, so each combined file carries a single header row.
combine_files(
    [os.path.join(data_folder, train_01), os.path.join(data_folder, train_02)],
    os.path.join(processed_data_folder, "train.tsv"),
    True,
)
combine_files(
    [
        os.path.join(data_folder, train_01),
        os.path.join(data_folder, train_02),
        os.path.join(data_folder, validation),
    ],
    os.path.join(processed_data_folder, "train_eval.tsv"),
    True,
)

In [None]:
# Load the combined training file as a tab-separated table.
df_combine_train = pd.read_csv(
    os.path.join(processed_data_folder, "train.tsv"),
    sep="\t",
)

In [None]:
# Display the combined training table.
df_combine_train

### Load image from given urls to the image folder

In [None]:
def load_img(data_file, img_folder, img_url_idx, img_id_idx):
    """
    load_img downloads the image behind every url listed in data_file,
    saves the images into the given image folder
    and uses the image id as the image name (<img_id>.jpg)
    :param data_file: tab-separated input file whose first line is a header;
                      it includes information such as img_url, img_id
    :param img_folder: image folder where downloaded images are saved
                       (created if it does not exist)
    :param img_url_idx: column idx of img url in the data_file
    :param img_id_idx: column idx of img id in the data_file
    """
    with open(data_file, "r", encoding="utf-8") as data:
        # Skip the header; the None default tolerates an empty file.
        next(data, None)
        print("start loading images")
        # Create the target folder once instead of re-checking it per row.
        if not os.path.exists(img_folder):
            os.makedirs(img_folder, exist_ok=True)
            print("The image directory is created!")
        for line in data:
            if not line.strip():
                # Blank lines carry no url/id columns.
                continue
            # Drop the line terminator first so that a value taken from the
            # last column does not end with "\n".
            columns = line.rstrip("\n").split("\t")
            image_url = columns[img_url_idx]
            image_id = columns[img_id_idx]
            filename = os.path.join(img_folder, image_id + ".jpg")
            try:
                # The timeout keeps one unresponsive host from blocking the
                # whole run; the with-block releases the streamed connection.
                with requests.get(image_url, stream=True,
                                  headers={'User-agent': 'Mozilla/5.0'},
                                  timeout=30) as r:
                    if r.status_code == 200:
                        with open(filename, 'wb') as img_file:
                            r.raw.decode_content = True
                            shutil.copyfileobj(r.raw, img_file)
                    else:
                        print("img can't be loaded")
            except requests.RequestException:
                # A failed download should not abort the remaining rows.
                print("img can't be loaded")

In [None]:
# Download the images of the training, train+eval and test files.
# The combined files are read with url/id at columns 3/4, the test file at 0/1.
load_img(
    data_file=os.path.join(processed_data_folder, "train.tsv"),
    img_folder="img/training",
    img_url_idx=3,
    img_id_idx=4,
)
load_img(
    data_file=os.path.join(processed_data_folder, "train_eval.tsv"),
    img_folder="img/train_eval",
    img_url_idx=3,
    img_id_idx=4,
)
load_img(
    data_file=os.path.join(data_folder, test),
    img_folder="img/test",
    img_url_idx=0,
    img_id_idx=1,
)

### Feature Extraction

In [None]:
def reformat_data_file(input_file, new_file):
    """
    reformat_data_file reformats the given input_file to facilitate further
    data processing: every data row is reduced to the three tab-separated
    columns img_id, img_name and title, written below a header line
    :param input_file: origin tsv file; its first line is skipped as a header,
                       columns 3, 4 and 6 are read as image url, image id and title
    :param new_file: output tsv file; rewritten from scratch on every call
    """
    # "w" instead of "a": a header is written on every call, so appending
    # would leave duplicate headers and rows when the cell is re-run.
    with open(input_file, "r", encoding="utf-8") as source, \
            open(new_file, "w", encoding="utf-8") as the_file:
        # Skip the header; the None default tolerates an empty file.
        next(source, None)
        the_file.write("img_id\timg_name\ttitle\n")
        for line in source:
            if not line.strip():
                # Blank lines carry no columns to extract.
                continue
            # Strip the terminator so a title in the last column does not
            # drag its own "\n" into the output.
            columns = line.rstrip("\n").split("\t")
            image_id = columns[4] + ".jpg"
            # Keep only the last path component of the image url.
            image_name = columns[3].split("/")[-1]
            title = columns[6]
            the_file.write(image_id + "\t" + image_name + "\t" + title + "\n")

In [None]:
# Reduce the combined training and train+eval files to id / name / title tables.
reformat_data_file(
    input_file=os.path.join(processed_data_folder, "train.tsv"),
    new_file=os.path.join(processed_data_folder, "train_title.tsv"),
)
reformat_data_file(
    input_file=os.path.join(processed_data_folder, "train_eval.tsv"),
    new_file=os.path.join(processed_data_folder, "train_eval_title.tsv"),
)

In [None]:
# Same reduction for the third batch, which serves as the validation set.
reformat_data_file(
    input_file=os.path.join(data_folder, "content2019-03-v3.tsv"),
    new_file=os.path.join(processed_data_folder, "eval_title.tsv"),
)

### Translate Title
translate article title into English 

In [None]:
def text_trans(file_path, title_idx=7):
    """
    text_trans translates one column of a tab-separated file line by line
    (source language 'de'; the target language is left at the translator's
    default) and returns the translated texts
    :param file_path: tab-separated input file; every line is translated,
                      including the first one
    :param title_idx: column idx of the text to translate (defaults to 7,
                      the column that used to be hard-coded)
    :return: list with one translated string per input line, in file order
    """
    # Local import: `time` is not among the imports visible in this notebook.
    import time

    # Read as utf-8 to match how the other cells write their .tsv files.
    with open(file_path, 'r', encoding="utf-8") as file_text:
        lines_text = file_text.readlines()
    if not lines_text:
        # Nothing to translate, so no translator client is needed.
        return []

    # NOTE(review): `Translator` is not imported in the visible cells;
    # presumably `from googletrans import Translator` - confirm and add it.
    translator = Translator()
    trans_lines_text = []
    for cnt, l_text in enumerate(lines_text, start=1):
        # Strip the terminator so a last-column title is sent without "\n".
        spes_text = l_text.rstrip("\n").split("\t")
        result_text = translator.translate(spes_text[title_idx], src='de')
        trans_lines_text.append(result_text.text)
        # Pause between requests - presumably to stay within the rate limits
        # of the translation service.
        time.sleep(1)
        print(cnt)
        if cnt % 50 == 0:
            print("finish sub_lines_test_text ", cnt)
    return trans_lines_text

In [None]:
def comb_title_eng(orig_file, titles_eng, output_file, aID_idx, title_idx, sep="\t"):
    """
    comb_title_eng pairs every line of orig_file with its translated title
    and appends "<id>\\t<original title>\\t<translated title>" rows to output_file
    :param orig_file: input file with one record per line
    :param titles_eng: translated titles, one per line of orig_file, same order
    :param output_file: output tsv file; rows are appended to it
    :param aID_idx: column idx of the id in orig_file
    :param title_idx: column idx of the original title in orig_file
    :param sep: column separator of orig_file; tab by default, because the
                inputs are .tsv files (pass "," for comma-separated input)
    :raises ValueError: if titles_eng has fewer entries than orig_file has lines
    """
    with open(orig_file, 'r', encoding="utf-8") as source:
        # Only the line terminator is removed: a plain strip() would also eat
        # a leading/trailing separator and shift the column indices.
        lines = [line.rstrip("\n") for line in source]

    # Fail before writing anything rather than midway through the output.
    if len(titles_eng) < len(lines):
        raise ValueError(
            "titles_eng has %d entries but %s has %d lines"
            % (len(titles_eng), orig_file, len(lines))
        )

    with open(output_file, 'a', encoding="utf-8") as the_file:
        for line, title_eng in zip(lines, titles_eng):
            segs = line.split(sep)
            the_file.write(
                segs[aID_idx] + "\t" + segs[title_idx] + "\t"
                + title_eng.rstrip("\n") + "\n"
            )

In [None]:
def trans_title(orig_file, output_file, aID_idx=0, title_idx=1):
    """
    trans_title translates the titles of orig_file and writes them next to
    the id and original title of each line into output_file
    :param orig_file: input file with one record per line
    :param output_file: output file that receives id, title and translated title
    :param aID_idx: column idx of the id in orig_file, forwarded to comb_title_eng
    :param title_idx: column idx of the title in orig_file, forwarded to comb_title_eng
    """
    # NOTE(review): text_trans is called with its own default column
    # selection - confirm that it reads the same column as title_idx.
    trans_lines_text = text_trans(orig_file)
    # comb_title_eng needs both column indices; they were previously missing
    # from this call and from this function's signature.
    comb_title_eng(orig_file, trans_lines_text, output_file, aID_idx, title_idx)

In [None]:
# Translate the training titles (id column 0, title column 1).
# NOTE(review): reformat_data_file writes img_id, img_name, title, so column 1
# of train_title.tsv is img_name - confirm the intended title column.
trans_title(
    os.path.join(processed_data_folder, "train_title.tsv"),
    os.path.join(processed_data_folder, "train_title_eng.tsv"),
    0,
    1,
)

In [None]:
# Translate the validation titles (id column 0, title column 1).
# NOTE(review): reformat_data_file writes img_id, img_name, title, so column 1
# of eval_title.tsv is img_name - confirm the intended title column.
trans_title(
    os.path.join(processed_data_folder, "eval_title.tsv"),
    os.path.join(processed_data_folder, "eval_title_eng.tsv"),
    0,
    1,
)

In [None]:
# Translate the titles of the test articles (id column 0, title column 4).
trans_title(
    os.path.join(data_folder, "MediaEvalNewsImagesBatch04articles.tsv"),
    os.path.join(processed_data_folder, "test_title_eng.tsv"),
    0,
    4,
)