In [72]:
import pandas as pd
import string, re
import sys, os
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from itertools import dropwhile
from copy import deepcopy

In [73]:
# English stop words from the NLTK corpus, held as a set for fast membership tests.
stops = {*stopwords.words('english')}

## Text Strings from SRT files

Based on the [pablo-var/learn-english-words-from-srt](https://github.com/pablo-var/learn-english-words-from-srt) repository.

In [74]:
def is_time_stamp(l):
  """
  Tell whether a line starts like a time stamp: two numeric
  characters followed by a colon (e.g. '00:01:02,000 --> ...').

  Returns False, rather than raising IndexError, when the line is
  too short to have a third character.
  """
  # Slicing (l[2:3]) yields '' on short strings where l[2] would raise.
  return l[:2].isnumeric() and l[2:3] == ':'

def has_letters(line):
  """Tell whether the line contains at least one ASCII letter (a-z or A-Z)."""
  return re.search('[a-zA-Z]', line) is not None

def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is 128 or above dropped."""
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return ''.join(ascii_chars)

def has_no_text(line):
  """
  Tell whether a subtitle line carries no dialogue text.

  A line counts as text-free when, after stripping whitespace, it is
  empty, purely numeric, starts like a time stamp, or is wrapped in
  parentheses from first to last character; or when it contains no
  ASCII letters at all.
  """
  stripped = line.strip()
  if not stripped or stripped.isnumeric() or is_time_stamp(stripped):
    return True
  # A line fully enclosed in parentheses, e.g. '(laughs)'.
  is_parenthetical = stripped[0] == '(' and stripped[-1] == ')'
  return is_parenthetical or not has_letters(line)

def is_lowercase_letter_or_comma(letter):
  """
  Tell whether ``letter`` is a comma, or is alphabetic and unchanged
  by lower-casing.
  """
  if letter == ',':
    return True
  return letter.isalpha() and letter == letter.lower()

def clean_up(lines):
  """
  Keep only the lines that carry text and stitch together text that
  was broken across several lines.

  Every line has its non-ASCII characters removed first. A kept line
  that begins with a lowercase letter or a comma is treated as a
  continuation and is appended (after a single space) to the previous
  kept line. Returns the resulting list of strings.
  """
  kept = []
  # The first element of ``lines`` is always skipped.
  # NOTE(review): presumably the first subtitle's index line - confirm.
  for raw in lines[1:]:
    text = remove_non_ascii(raw)
    if has_no_text(text):
      continue
    if kept and is_lowercase_letter_or_comma(text[0]):
      # Continuation: merge into the previous kept line.
      kept[-1] = kept[-1].strip() + ' ' + text
      continue
    kept.append(text)
  return kept

# Directory of subtitle files. Each file name is read as
# "<title words joined by '-'>-<year>" followed by a 4-character extension.
SRT_DIR = "data/final_project/srts"

# Collect one (title, year, text) record per file and build the frame once at
# the end: concatenating onto a DataFrame inside the loop re-copies every
# earlier row on each iteration.
movie_records = []

for file_name in os.listdir(SRT_DIR):

    with open(os.path.join(SRT_DIR, file_name), 'r', encoding="ISO-8859-1") as f:
        lines = f.readlines()

    # Keep only dialogue lines, join them into one string and drop italic tags.
    new_lines = clean_up(lines)
    all_lines_str = ' '.join(new_lines)
    all_lines_str = all_lines_str.replace('<i>', '').replace('</i>', '')

    # The last '-'-separated piece is the year; the pieces before it form the title.
    movie_strs = file_name[:-4].split('-')
    movie_name = ' '.join(movie_strs[:-1])
    movie_year = movie_strs[-1]

    movie_records.append((movie_name, movie_year, all_lines_str))

movies = pd.DataFrame(movie_records, columns=['Title', 'Year', 'Words'])

movies

Unnamed: 0,Title,Year,Words
0,Gravity,2013,Please verify that the P1 ATA removal...\n ......
1,You Can't Take It with You,1938,"Good morning, Mr. Kirby.\n Good morning.\n - G..."
2,Adventure,1945,"Why don't you put some ant powder on that, cap..."
3,Lethal Weapon 2,1989,I love this job!\n Can't you go faster?\n - Fl...
4,Star Wars: The Rise of Skywalker,2019,At last.\n Snoke trained you well.\n I killed ...
...,...,...,...
874,"Oh, God!",1977,"-Jer, clean or dirty?\n -Marginal.\n No, it's ..."
875,Fast & Furious 6,2013,"Seor O'Conner, seor O'Conner!\n It's okay, you..."
876,The Sound of Music,1965,[ Wind Whistling ]\n [ Wind owling ]\n [ Birds...
877,Independence Day,1996,[Indistinct\n Radio Communication]\n Radio: Fo...


In subtitles, bracketed text represents non-dialogue descriptions such as '[Wind Whistling]' or '[Indistinct Radio Communication]'. The next cell removes bracketed text together with its brackets. Line returns, '\n', are also replaced with empty strings.

In [75]:
def remove_non_dialogue(s):
    """
    Return ``s`` without line returns and without bracketed spans.

    Line returns are deleted first, so a bracketed description that was
    split across lines is still matched and removed as one span.
    """
    without_returns = s.replace('\n', '')
    # Non-greedy match: each '[...]' span is removed on its own.
    return re.sub(r'\[.*?\]', '', without_returns)

# Strip line returns and bracketed descriptions from every movie's text.
movies['Words'] = movies['Words'].apply(remove_non_dialogue)
movies

Unnamed: 0,Title,Year,Words
0,Gravity,2013,Please verify that the P1 ATA removal... ...on...
1,You Can't Take It with You,1938,"Good morning, Mr. Kirby. Good morning. - Good ..."
2,Adventure,1945,"Why don't you put some ant powder on that, cap..."
3,Lethal Weapon 2,1989,I love this job! Can't you go faster? - Floor ...
4,Star Wars: The Rise of Skywalker,2019,At last. Snoke trained you well. I killed Snok...
...,...,...,...
874,"Oh, God!",1977,"-Jer, clean or dirty? -Marginal. No, it's not...."
875,Fast & Furious 6,2013,"Seor O'Conner, seor O'Conner! It's okay, you'r..."
876,The Sound of Music,1965,The hills are alive With the sound of ...
877,Independence Day,1996,"Radio: For those who haven't, uh, read the pl..."


Combine movie text strings by year and by decade. The aggregate function is based on a [Stack Overflow response](https://stackoverflow.com/questions/62729238/how-to-merger-same-column-values-based-on-other-column-values).

In [81]:
# Decade label: the year string with its last character replaced by '0'.
movies['Decade'] = movies['Year'].str[:-1] + '0'
movies

Unnamed: 0,Title,Year,Words,Decade
0,Gravity,2013,Please verify that the P1 ATA removal... ...on...,2010
1,You Can't Take It with You,1938,"Good morning, Mr. Kirby. Good morning. - Good ...",1930
2,Adventure,1945,"Why don't you put some ant powder on that, cap...",1940
3,Lethal Weapon 2,1989,I love this job! Can't you go faster? - Floor ...,1980
4,Star Wars: The Rise of Skywalker,2019,At last. Snoke trained you well. I killed Snok...,2010
...,...,...,...,...
874,"Oh, God!",1977,"-Jer, clean or dirty? -Marginal. No, it's not....",1970
875,Fast & Furious 6,2013,"Seor O'Conner, seor O'Conner! It's okay, you'r...",2010
876,The Sound of Music,1965,The hills are alive With the sound of ...,1960
877,Independence Day,1996,"Radio: For those who haven't, uh, read the pl...",1990


In [83]:
def _join_words(words):
    """Join a group's non-null ``Words`` strings with ', '; NaN when there are none."""
    present = words.dropna()
    return ", ".join(present) if len(present) else np.nan


def _combine_words_by(frame, key):
    """Return one row per distinct ``key`` value, with that group's ``Words`` joined."""
    return frame.groupby(by=key, as_index=False).agg({"Words": _join_words})


movie_year_groups = _combine_words_by(movies, "Year")

# The first decade group is discarded and the remaining rows keep their index
# labels. Slicing the index ([:1]) makes this a no-op on an empty result
# instead of raising IndexError.
# NOTE(review): why the first group is excluded is not recorded here - confirm
# which group this is meant to remove.
movie_decade_groups = _combine_words_by(movies, "Decade")
movie_decade_groups = movie_decade_groups.drop(index=movie_decade_groups.index[:1])
movie_decade_groups


Unnamed: 0,Decade,Words
1,1920,They called her frivolous Sal A peculiar sort...
2,1930,"Good morning, Mr. Kirby. Good morning. - Good ..."
3,1940,"Why don't you put some ant powder on that, cap..."
4,1950,Ohh. You see what happened there? I'm pretty s...
5,1960,"Ladies and gentlemen, we are about to begin ou..."
6,1970,They've killed the boy! They've killed young P...
7,1980,I love this job! Can't you go faster? - Floor ...
8,1990,There have been many tales of the great warrio...
9,2000,"Rachel, let me see. - Can I see? - Finders kee..."
10,2010,Please verify that the P1 ATA removal... ...on...


Create a txt file for each decade, with the file named after the decade. Based on a [Stack Overflow solution](https://stackoverflow.com/questions/28377072/write-contents-of-dataframe-to-txt-files-with-one-file-per-row).

In [84]:
# Write one "<decade>.txt" file per row, containing that decade's combined text
# in the same CSV-formatted single-cell form as before.
for _, decade_row in movie_decade_groups.iterrows():
    # Address the columns by label: integer keys on a label-indexed Series
    # (row[0], row[1]) are a deprecated positional lookup.
    pd.DataFrame([decade_row["Words"]]).to_csv(
        decade_row["Decade"] + ".txt", header=False, index=False)

In [None]:
# Persist the per-movie table and the per-year aggregation, without the index column.
movies.to_csv(path_or_buf='data/movie_data.csv', index=False)
movie_year_groups.to_csv(path_or_buf='data/movie_data_by_year.csv', index=False)