In [None]:
!pip install orjson
!pip install pytrends

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import orjson
import datetime
from datetime import timedelta
from pytrends.request import TrendReq
from dateutil import parser
from tqdm import tqdm
import warnings
from functools import reduce
from time import sleep



In [None]:
def get_creation(wikipedia_link, session):
  """Return the date of an article's first revision as a ``YYYYMMDD`` string.

  The article title is the text after the final ``/`` of ``wikipedia_link``.
  It is sent to the MediaWiki ``action=query`` endpoint asking for a single
  revision in oldest-first order (``rvlimit=1``, ``rvdir=newer``).

  Args:
    wikipedia_link: URL of the English Wikipedia article.
    session: object with ``get(url, headers=...)`` whose response exposes
      ``.json()`` (e.g. a ``requests.Session``).

  Returns:
    The revision timestamp reformatted as ``%Y%m%d``.

  No error handling is done here: a response without the expected
  ``query``/``pages``/``revisions`` entries propagates as an exception.
  """
  article = wikipedia_link.rsplit('/', 1)[-1]
  endpoint = f'https://en.wikipedia.org/w/api.php?action=query&format=json&titles={article}&prop=revisions&rvprop=timestamp&rvlimit=1&rvdir=newer'
  payload = session.get(endpoint, headers={'User-Agent':'Mozilla/5.0'}).json()

  # The pages mapping is keyed by page id; only the first entry is used.
  pages = payload['query']['pages']
  first_page = pages[list(pages)[0]]
  stamp = first_page['revisions'][0]['timestamp']

  return datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y%m%d')

def get_release(wikipedia_link, session):
  """Scrape a release date from a Wikipedia page, best effort.

  Every ``<tr>`` of the page is scanned in document order. For a row whose
  text contains "Release dates", the first line mentioning "United States" or
  "US" is preferred, falling back to the first line containing "(". For a row
  containing "Release date", the first line containing "(" is used. The
  chosen line has parenthesised text and non-breaking spaces removed before
  being parsed with ``dateutil``.

  Args:
    wikipedia_link: URL of the article to fetch.
    session: object with ``get(url, headers=...)`` whose response exposes
      ``.text`` (e.g. a ``requests.Session``).

  Returns:
    The date as a ``YYYYMMDD`` string, or ``None`` when no matching row is
    found or when fetching/parsing fails.
  """
  def _as_yyyymmdd(cell_line):
    # Strip notes such as "(United States)" and non-breaking spaces so that
    # only the date text reaches the parser.
    cleaned = re.sub(r'\([^)]*\)', '', cell_line).replace('\xa0', ' ').strip()
    return parser.parse(cleaned).strftime('%Y%m%d')

  try:
    response = session.get(wikipedia_link, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')

    for row in soup.find_all('tr'):
      row_text = row.text

      if "Release dates" in row_text:
        lines = row_text.strip().split('\n')
        us_lines = [line for line in lines if 'United States' in line or 'US' in line]
        dated_lines = [line for line in lines if '(' in line]
        # US entries win; otherwise take the first line carrying a "(...)" note.
        candidates = us_lines + dated_lines
        if candidates:
          return _as_yyyymmdd(candidates[0])

      elif 'Release date' in row_text:
        lines = row_text.strip().split('\n')
        dated_lines = [line for line in lines if '(' in line]
        if dated_lines:
          return _as_yyyymmdd(dated_lines[0])
  except Exception:
    # Deliberate best effort: network and parse failures mean "unknown".
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    return None

  return None

def api_call(url, first_date, end_date, session):
    """Fetch daily page views for an article and return one count per day.

    The article name is the text after the final ``/`` of ``url``. It is
    queried against the Wikimedia per-article pageviews endpoint
    (``en.wikipedia``, ``all-access``, ``all-agents``, ``daily``).

    Args:
        url: article URL.
        first_date: first day of the range, ``YYYYMMDD`` string.
        end_date: last day of the range (inclusive), ``YYYYMMDD`` string.
        session: object with ``get(url, headers=...)`` whose response exposes
            ``.content``.

    Returns:
        A list with one entry per calendar day from ``first_date`` to
        ``end_date`` inclusive; days absent from the response are ``0``.
    """
    article = url.rsplit('/', 1)[-1]
    endpoint = f'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{article}/daily/{first_date}/{end_date}'
    payload = orjson.loads(session.get(endpoint, headers={'User-Agent':'Mozilla/5.0'}).content)

    first_day = datetime.datetime.strptime(first_date, '%Y%m%d')
    last_day = datetime.datetime.strptime(end_date, '%Y%m%d')

    # Index the returned counts by the YYYYMMDD prefix of each timestamp.
    views_by_day = {}
    for entry in payload['items']:
        views_by_day[entry['timestamp'][:8]] = entry['views']

    # Walk every day of the range so gaps in the response become zeros.
    day_count = (last_day - first_day).days + 1
    return [
        views_by_day.get((first_day + datetime.timedelta(days=offset)).strftime('%Y%m%d'), 0)
        for offset in range(day_count)
    ]

def get_wiki(title, link, session, timestamp):
  """Collect Wikipedia page-view history for one film, split into 30-day windows.

  ``timestamp`` is how many days before release the pull is being made
  (30, 60 or 90). Both ints and numeric strings are accepted, because the
  sibling ``gt`` function compares against ints while this function
  historically compared against strings.

  For each horizon ``h`` in (90, 60, 30) with ``h >= timestamp`` two keys are
  produced, where ``offset = h - timestamp`` days back from today:

  * ``wiki_{h}_raw``  - all data up to ``offset`` days ago (``None`` when the
    history is not longer than ``offset``; the full series when ``offset`` is 0).
  * ``wiki_{h}_pure`` - the 30 days ending ``offset`` days ago (``None`` when
    the history is not longer than ``offset + 30``).

  Args:
    title: film title, stored unchanged under ``'title'``.
    link: Wikipedia article URL.
    session: HTTP session passed through to the helper functions.
    timestamp: 30, 60 or 90 (int or numeric string).

  Returns:
    dict with ``title``, ``creation`` (YYYYMMDD str), ``release`` (YYYYMMDD str
    or None), ``current`` (date), ``wiki_data`` (list) and the window keys above.

  Raises:
    ValueError: if ``timestamp`` is not 30, 60 or 90.
  """
  # Validate before any network call is made.
  try:
    days_out = int(timestamp)
  except (TypeError, ValueError):
    raise ValueError('Not valid timestamp entry.')
  if days_out not in (30, 60, 90):
    raise ValueError('Not valid timestamp entry.')

  a = {'title': title}
  a['creation'] = get_creation(link, session)
  a['release'] = get_release(link, session)
  a['current'] = datetime.datetime.now().date()

  if a['release'] is not None:
    # get_release returns a YYYYMMDD string; convert it before doing date maths.
    release_day = datetime.datetime.strptime(a['release'], '%Y%m%d').date()
    if a['current'] != release_day - datetime.timedelta(days=days_out):
      warnings.warn('Current date threshold differs from release date shown by Wikipedia. Models are calibrated for 90 days, 60 days and 30 days prior to release, and results may vary for use at other times.')

  a['wiki_data'] = api_call(link, a['creation'], a['current'].strftime('%Y%m%d'), session)

  data = a['wiki_data']
  for horizon in (90, 60, 30):
    if horizon < days_out:
      continue  # that point in time has not been reached yet
    offset = horizon - days_out
    cutoff = len(data) - offset  # index just past the last day of this window

    a[f'wiki_{horizon}_pure'] = data[cutoff - 30:cutoff] if cutoff > 30 else None
    if offset == 0:
      a[f'wiki_{horizon}_raw'] = data
    else:
      a[f'wiki_{horizon}_raw'] = data[:cutoff] if cutoff > 0 else None

  return a

# Google Trends

def ready_title(text):
  """Normalise a film title for use as a Google Trends keyword.

  Surrounding whitespace is removed. Titles of 100 characters or more are cut
  to 99 characters and, when a colon remains, shortened further to the part
  before the last colon (i.e. the subtitle is dropped).

  Args:
    text: raw film title.

  Returns:
    The cleaned title; short titles are returned stripped but otherwise
    unchanged.
  """
  title = text.strip()
  if len(title) >= 100:
    title = title[:99]
    colon = title.rfind(':')
    # Only cut at a colon that leaves a non-empty main title.
    if colon > 0:
      title = title[:colon].strip()
  return title

def get_trends_data(film_title, pytrends, start_time, end_time):
  """Query Google Trends for one keyword over a date range.

  Args:
    film_title: keyword to search for.
    pytrends: client exposing ``build_payload`` and ``interest_over_time``.
    start_time: range start, expected in ``%Y-%m-%d`` format.
    end_time: range end, expected in ``%Y-%m-%d`` format.

  Returns:
    The frame from ``interest_over_time()`` with the keyword's column renamed
    to ``'film'`` (no such column exists if the keyword column was absent).
  """
  window = f'{start_time} {end_time}'
  pytrends.build_payload(kw_list=[film_title], timeframe=window)
  interest = pytrends.interest_over_time()
  return interest.rename(columns={film_title: 'film'})

def get_gt(title, t, start_time, end_time):
  """Pull a Google Trends series for ``title``, retrying and shortening as needed.

  The query is retried after a 60 second pause whenever the request raises
  (treated as rate limiting). When the response has no ``'film'`` column, the
  title is shortened - to the part before the last colon, or to its first
  half when there is no colon - and the query is repeated.

  Args:
    title: keyword to search for.
    t: Google Trends client passed through to ``get_trends_data``.
    start_time: range start (``%Y-%m-%d``).
    end_time: range end (``%Y-%m-%d``).

  Returns:
    The ``'film'`` column as a list, or ``None`` once the title has been
    shortened to an empty string without any variant returning data.
  """
  while True:
    if not title:
      # Every shortened variant came back without data; stop instead of
      # re-querying an empty keyword forever.
      return None

    try:
      a = get_trends_data(title, t, start_time, end_time)
      sleep(1)  # brief pause between successful requests
    except Exception as e:
      print(e,  ': rate limit - relaunching Google Trends')
      sleep(60)
      continue

    try:
      return a['film'].to_list()
    except KeyError:
      # No column for this keyword: fall back to a shorter title.
      index = title.rfind(':')
      if index == -1:
        title = title[: len(title) // 2]
      else:
        title = title[:index]

def gt(release, title, t, timestamp):
  """Collect Google Trends windows for one film relative to today.

  ``timestamp`` is how many days before release the pull is being made
  (30, 60 or 90; ints or numeric strings). "pure" keys cover a single 30-day
  window, "raw" keys cover everything from the earliest window through the
  end of that window.

  Args:
    release: release date as a ``YYYYMMDD`` string (as returned by
      ``get_release``), a date object, or ``None`` when unknown.
    title: film title; a cleaned copy is used as the search keyword.
    t: Google Trends client.
    timestamp: 30, 60 or 90.

  Returns:
    dict of window name -> list of values, plus ``'title'`` holding the
    title exactly as passed in, so the result can be joined back on it.

  Raises:
    ValueError: if ``timestamp`` is not 30, 60 or 90.
  """
  try:
    days_out = int(timestamp)
  except (TypeError, ValueError):
    raise ValueError('Not valid timestamp entry.')
  if days_out not in (30, 60, 90):
    raise ValueError('Not valid timestamp entry.')

  today = datetime.datetime.now().date()
  current = today.strftime('%Y-%m-%d')
  back_30 = (today - datetime.timedelta(days=30)).strftime('%Y-%m-%d')
  back_60 = (today - datetime.timedelta(days=60)).strftime('%Y-%m-%d')
  back_90 = (today - datetime.timedelta(days=90)).strftime('%Y-%m-%d')
  query = ready_title(title)

  if release is not None:
    # get_release hands back a YYYYMMDD string; convert before date maths.
    if isinstance(release, str):
      release_day = datetime.datetime.strptime(release, '%Y%m%d').date()
    else:
      release_day = release
    if today != release_day - datetime.timedelta(days=days_out):
      warnings.warn('Current date threshold differs from release date shown by Wikipedia. Models are calibrated for 90 days, 60 days and 30 days prior to release, and results may vary for use at other times.')

  if days_out == 30:
    a = {'gt_90': get_gt(query, t, back_90, back_60),
        'gt_60_pure': get_gt(query, t, back_60, back_30),
        'gt_60_raw': get_gt(query, t, back_90, back_30),
        'gt_30_pure': get_gt(query, t, back_30, current),
        'gt_30_raw': get_gt(query, t, back_90, current)}
  elif days_out == 60:
    # Same layout as the 30-day case shifted by one window: the cumulative
    # "raw" series runs through today rather than repeating the gt_90 window.
    a = {'gt_90': get_gt(query, t, back_60, back_30),
        'gt_60_pure': get_gt(query, t, back_30, current),
        'gt_60_raw': get_gt(query, t, back_60, current)}
  else:
    # At 90 days out the only available window is the most recent 30 days,
    # matching the [-30:] slice used for wiki_90_pure in get_wiki.
    a = {'gt_90': get_gt(query, t, back_30, current)}

  a['title'] = title
  return a

def individual_pull(title, wikipedia_url, timestamp):
  """Pull Wikipedia and Google Trends data for one film into a one-row frame.

  Args:
    title: film title.
    wikipedia_url: URL of the film's Wikipedia article.
    timestamp: days before release (30, 60 or 90), forwarded to the
      4-argument ``get_wiki(title, link, session, timestamp)`` and to ``gt``.

  Returns:
    DataFrame with a single row; list-valued series are kept as list cells.
    The ``release`` column is dropped from the result.

  NOTE(review): a later cell in this notebook redefines ``get_wiki`` with a
  3-argument signature; this function needs the 4-argument version.
  """
  # The context manager closes the HTTP session once both pulls are done.
  with requests.Session() as s:
    t = TrendReq()
    wiki_dict = get_wiki(title, wikipedia_url, s, timestamp)
    gt_dict = gt(wiki_dict['release'], title, t, timestamp)

  # Wrap each dict in a list so it becomes ONE row whose cells may hold lists;
  # passing the dict directly fails when its lists have different lengths.
  df_wiki = pd.DataFrame([wiki_dict])
  df_gt = pd.DataFrame([gt_dict])

  merged = pd.merge(df_wiki, df_gt, how='inner', on='title')
  # errors='ignore': 'link' is not produced by the helpers, so a strict drop fails.
  master_df = merged.drop(columns=['release', 'link'], errors='ignore')

  return master_df





In [None]:
from urllib3.util.retry import RequestHistory
def filter_it(subset):
  """Split a daily series into trailing 30-day windows.

  Window keys are set to ``None`` when ``subset`` is too short for them:

  * ``wiki_earliest_set_pure`` - ``subset[-90:-60]``, needs more than 90 items.
  * ``wiki_earliest_set_com``  - ``subset[-90:]``, needs more than 60 items.
  * ``wiki_mid_set_pure``      - ``subset[-60:-30]``, needs more than 60 items.
  * ``wiki_mid_set_com``       - ``subset[-60:]``, needs more than 30 items.
  * ``wiki_latest_set``        - ``subset[-30:]``, needs more than 30 items.
  * ``wiki_all_data``          - ``subset`` itself, always present.

  Args:
    subset: sliceable sequence of daily values.

  Returns:
    dict with the six keys above, in that order.
  """
  size = len(subset)
  return {
    'wiki_earliest_set_pure': subset[-90:-60] if size > 90 else None,
    'wiki_earliest_set_com': subset[-90:] if size > 60 else None,
    'wiki_mid_set_pure': subset[-60:-30] if size > 60 else None,
    'wiki_mid_set_com': subset[-60:] if size > 30 else None,
    'wiki_latest_set': subset[-30:] if size > 30 else None,
    'wiki_all_data': subset,
  }

def get_wiki(link, session, timestamp):
  """Build the page-view windows for one article, as of ``timestamp`` days before release.

  Page views are fetched from the article's first revision up to its release
  date; the final ``timestamp`` days are then discarded and the remainder is
  split by ``filter_it``.

  Args:
    link: Wikipedia article URL.
    session: HTTP session passed through to the helper functions.
    timestamp: 30, 60 or 90.

  Returns:
    The ``filter_it`` dict extended with ``wikipedia_url`` and ``release``
    (``YYYYMMDD`` string or ``None``). When no release date could be scraped a
    warning is issued and the windows are built from an empty series, so a
    batch run can continue past that row.

  Raises:
    ValueError: if ``timestamp`` is not 30, 60 or 90.
  """
  # Reject bad input before spending any network requests.
  if timestamp not in (30, 60, 90):
    raise ValueError('Not valid timestamp entry.')

  release = get_release(link, session)
  if release is None:
    # api_call cannot parse a missing end date.
    warnings.warn(f'No release date found for {link}; returning empty page-view windows.')
    x = filter_it([])
  else:
    creation = get_creation(link, session)
    data = api_call(link, creation, release, session)
    x = filter_it(data[:-int(timestamp)])

  x['wikipedia_url'] = link
  x['release'] = release
  return x

def ready_title(text):
  """Normalise a film title for use as a Google Trends keyword.

  Parenthesised text (e.g. a year such as "(2023)") is removed and the result
  is stripped. Titles still 100 characters or longer are cut to 99 characters
  and, when a colon remains, shortened to the part before the last colon.

  Args:
    text: raw film title.

  Returns:
    The cleaned title.
  """
  title = re.sub(r'\([^)]*\)', '', text)
  title = title.strip()
  if len(title) >= 100:
    title = title[:99]
    colon = title.rfind(':')
    # Only cut at a colon that leaves a non-empty main title.
    if colon > 0:
      title = title[:colon].strip()
  return title

def get_trends_data(film_title, release, t, timestamp):
  """Pull the Google Trends windows for one film, relative to its release date.

  The reference point is ``timestamp`` days before ``release``. Each window is
  given as (days before the reference where it starts, days before the
  reference where it ends).

  Args:
    film_title: title as it appears in the input table; stored unchanged under
      ``'title'`` while a cleaned copy is used as the search keyword.
    release: release date as a ``YYYYMMDD`` string, or a missing value.
    t: Google Trends client exposing ``build_payload``/``interest_over_time``.
    timestamp: days before release to treat as "now".

  Returns:
    dict with ``'title'`` and one list per window. A window is ``None`` when
    the release date is missing or when no variant of the title returned data.
  """
  a = {'title': film_title}
  film_title = ready_title(film_title)

  # "pure" = one 30-day window, "raw"/"com" = from that window's start to the
  # reference point (same convention as the mid-set pair below).
  times = {
      'gt_earliest_set_pure': (90, 60),
      'gt_earliest_set_raw': (90, 0),
      'gt_mid_set_pure': (60, 30),
      'gt_mid_set_com': (60, 0),
      'gt_latest_set': (30, 0)
      }

  if pd.isna(release):
    # No release date to anchor the windows on.
    a.update(dict.fromkeys(times))
    return a

  release = datetime.datetime.strptime(release, "%Y%m%d")
  end = release - datetime.timedelta(days=int(timestamp))

  for key, (days_from, days_to) in times.items():
    start_date = (end - datetime.timedelta(days=days_from)).strftime('%Y-%m-%d')
    end_date = (end - datetime.timedelta(days=days_to)).strftime('%Y-%m-%d')

    while True:
      if not film_title:
        # Title was shortened to nothing without ever returning data.
        a[key] = None
        break

      try:
        t.build_payload(kw_list=[film_title], timeframe=f'{start_date} {end_date}')
        trends_data = t.interest_over_time()
      except Exception as e:
        print(e,  ': rate limit hit - relaunching pull for', film_title)
        sleep(60)
        continue

      # The result column is looked up under the keyword itself.
      if film_title in trends_data:
        a[key] = trends_data[film_title].to_list()
        sleep(1)
        break

      # No data for this keyword: drop the subtitle, or halve the title.
      index = film_title.rfind(':')
      if index == -1:
        film_title = film_title[: len(film_title) // 2]
      else:
        film_title = film_title[:index]

  return a

In [None]:
def master_data(df, timeframe):
  """Attach Wikipedia and Google Trends features to every row of ``df``.

  Args:
    df: frame with at least ``title`` and ``wikipedia_url`` columns. It is not
      modified; a re-indexed copy is used.
    timeframe: days before release (30, 60 or 90), forwarded to ``get_wiki``
      and ``get_trends_data``.

  Returns:
    A new frame with one row per input row: the input columns followed by the
    Wikipedia columns and the Google Trends columns.
  """
  # Work on a fresh 0..n-1 indexed copy: this avoids SettingWithCopyWarning when
  # the caller passes a column slice, and lets results be joined by position.
  df = df.reset_index(drop=True)
  t = TrendReq()

  with requests.Session() as s:
    tqdm.pandas(desc='{Wikipedia Data Pull}')
    wiki_records = df['wikipedia_url'].progress_apply(lambda x: get_wiki(x, s, timeframe))

  # Records come back in row order, so join them side by side. Merging on
  # 'wikipedia_url' multiplies rows whenever a URL occurs more than once.
  wiki_df = pd.json_normalize(wiki_records.tolist()).drop(columns=['wikipedia_url'], errors='ignore')
  df = pd.concat([df, wiki_df], axis=1)

  tqdm.pandas(desc='{Google Trends Pull - may take multiple iterations}')
  gt_records = df.progress_apply(lambda x: get_trends_data(x['title'], x['release'], t, timeframe), axis=1)
  gt_df = pd.json_normalize(gt_records.tolist()).drop(columns=['title'], errors='ignore')
  df = pd.concat([df, gt_df], axis=1)

  return df

# Load the prepared title/URL table and run the full pull at 30 days before release.
df = pd.read_csv('input.csv')
# .copy() makes df2 an independent frame; assigning new columns to a bare
# column slice is what triggers pandas' SettingWithCopyWarning.
df2 = df[['title', 'wikipedia_url']].copy()

a = master_data(df2, 30)

{Wikipedia Data Pull}: 100%|██████████| 104/104 [00:22<00:00,  4.55it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wiki_data'] = df['wikipedia_url'].progress_apply(lambda x: get_wiki(x, s, timeframe))
{Google Trends Pull - may take multiple iterations}:   1%|          | 1/164 [00:00<00:00, 995.80it/s]


AttributeError: ignored

In [None]:
# Build input.csv (read by the pull cell): titles present in both source files,
# paired with their Wikipedia URLs.
df1 = pd.read_csv('/content/linksteset.csv')
df2 = pd.read_csv('/content/okok.csv')['title']
df3 = pd.merge(df1, df2, how='inner', on = 'title')
# Manual correction of one title (row label 86 of the merged frame).
df3.loc[86, 'title'] = 'Big George Foreman (2023)'
df3 = df3[['title', 'wikipedia_url']]
#df4 = pd.read_csv('/content/Box Office - Sheet2.csv')
#df5 = pd.merge(df3, df4, on='title', how='inner')

# df5 only exists when the two lines above are uncommented; with them disabled
# the frame to save is df3. index=False keeps the row index out of the file.
df3.to_csv('input.csv', index=False)