In [5]:
# The command that was used in powershell
# to get the first n rows of the original csv dataset:
#  Import-Csv -Path C:\path\to\news_cleaned_2018_02_13.csv "," -Encoding utf8 |
# >>     Select -First 1000000 |
# >>     Export-Csv -Path C:\path\to\news_10_percent_utf8.csv -NoTypeInformation (-Encoding utf8

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#import spacy
import os
#import time
from collections import Counter

#from pandarallel import pandarallel
#num_cores = os.cpu_count()
#pandarallel.initialize(progress_bar=True, nb_workers=num_cores)

In [7]:
# count urls, dates, numeric values
def count_urls(text):
  import re

  url_pattern = r'https?://\S+'
  urls = re.findall(url_pattern, str(text))
  return len(urls)

def count_dates(text):
  import datefinder

  matches = datefinder.find_dates(text, source=True, strict=True)
  return len(list(matches))

def count_numeric_values(text):
  import re

  numeric_pattern = r'\b\d+(?:\.\d+)?\b'
  numeric_values = re.findall(numeric_pattern, text)
  return len(numeric_values)

def run_and_write_all_counts(type, df, suffix):
  display(f"Counting {type}...")
  df_counts_of_urls = df['content'].parallel_apply(count_urls)
  df_counts_of_dates = df['content'].parallel_apply(count_dates)
  df_counts_of_numeric_values = df['content'].parallel_apply(count_numeric_values)

  combined_df = pd.concat([df_counts_of_urls, df_counts_of_dates, df_counts_of_numeric_values], axis=1)
  combined_df = pd.concat([df_counts_of_urls], axis=1)
  
  combined_df.columns = ['url_count', 'date_count', 'numeric_count']
  display(combined_df.head())

  # write to file
  #counts_output_file = f"{type}_counts"
  #combined_df.to_csv(f"./export/{counts_output_file}{suffix}ASDF.csv", encoding='utf-8', index=False, header=True)
  #display(f"Wrote {type} counts to {counts_output_file}")

  return combined_df # return results if not writing to file

def replace_url_column(df, df_mod, output_name):
  df['url_count'] = df_mod['url_count']

  # Convert the 'url_count' column to integer dtype if there are no NaNs
  #if df['url_count'].isnull().sum() == 0:
  #df['url_count'] = df['url_count'].astype(int)

  df.to_csv(f"./export/{output_name}.csv", encoding='utf-8', index=False, header=True)
  display(f"Wrote {type} counts to {output_name}")
  return


In [2]:
#spacy.require_gpu()
nlp = spacy.load('en_core_web_sm', exclude=["parser","ner"])

In [115]:
def get_view(df_news, is_reliable):
  df_news_type_view = df_news[df_news["type"].isin(["reliable", "fake"])][["id", "type", "content"]]

  #data splitting 80%, 10%, 10%
  df_train, df_test = train_test_split(df_news_type_view, test_size=0.1, random_state=1)
  df_train, df_val = train_test_split(df_train, test_size=0.11, random_state=1)

  if is_reliable:
    df_reliable_news_view = df_train[df_train["type"] == "reliable"][["id", "content"]]
    return df_reliable_news_view
  else:
    df_fake_news_view = df_train[df_train["type"] == "fake"][["id", "content"]]

    return df_fake_news_view

def process_text(row, nlp, counters):
  tokens = nlp(row["content"])
  tokens = [token.lemma_ for token in tokens if (not token.is_stop and not token.is_punct)]
  #return tokens
  
  counters[row.name % 4].update(tokens)
  #return counters

def process_text_keep_stop(row, nlp, counters):
  tokens = nlp(row["content"])
  tokens = [token.lemma_ for token in tokens if (not token.is_punct)]
  #return tokens
  
  counters[row.name % 4].update(tokens)
  #return counters

def process_chunk(df_chunk):
  def process_text(text, nlp, counter):
    tokens = nlp(text)
    tokens = [token.lemma_ for token in tokens if (not token.is_stop and not token.is_punct)]
    #return tokens
    
    counter.update(tokens)
  
  # Initialize a counter for this chunk
  import spacy
  nlp = spacy.load('en_core_web_sm', exclude=["parser","ner"])

  from collections import Counter
  chunk_counter = Counter()

  df_cleaned = df_chunk.dropna()

  # Update the counter for each row in the chunk
  df_cleaned.apply(process_text, nlp=nlp, counter=chunk_counter)
  
  return chunk_counter

def run_and_write_frequency(type, df_tokenized, name):
  display(f"Counting {type}...")
  # Tokenize, add unique words to counter

  # Split the DataFrame into chunks
  num_cores = 4
  num_rows = len(df_tokenized)
  chunk_size = num_rows // num_cores
  chunks = []

  # Split the DataFrame into chunks
  for i in range(0, num_rows, chunk_size):
    chunk = df_tokenized.iloc[i:i+chunk_size]
    chunks.append(chunk)

  # If there are remaining rows, add them to the last chunk
  if num_rows % chunk_size != 0:
      last_chunk = chunks.pop()  # Remove the last chunk
      chunks[-1] = pd.concat([chunks[-1], last_chunk])

  df_chunks = pd.DataFrame(chunks)
  
  # count occurrances of words in each chunk
  counters = df_chunks.parallel_apply(process_chunk, nlp=nlp, axis=1)

  #concatenate the counters into one counter
  counter = counters[0]
  for i, current_counter in enumerate(counters):
    if i == 0:
      continue
    counter.update(current_counter)
  
  # Sort the data by frequency
  counter = counter.most_common()

  for index, word_freq in enumerate(counter):
    print(str(word_freq).center(20), end=" ")
    if ((index+1)%5 == 0):
      print()

    # get only the top 100
    if (index > 3):
      break
  
  # Convert Counter to DataFrame
  counter_df = pd.DataFrame(list(counter), columns=['Word', 'Count'])

  # Save DataFrame to CSV file
  counter_df.to_csv(f'./export_freq/{name}.csv', index=False)

def run_and_write_frequency_faster(type, df, name, keep_stop):
  display(f"Counting {type}...")
  # Tokenize, add unique words to counter

  # Split the DataFrame into chunks
  num_cores = os.cpu_count()
  num_rows = df.shape[0]
  chunk_size = num_rows // num_cores
  chunks = []

  for i in range(0, num_rows, chunk_size):
    chunk = df.iloc[i:i+chunk_size]
    chunks.append(chunk)

  # If there are remaining rows, add them to the last chunk
  if num_rows % chunk_size != 0:
      last_chunk = chunks.pop()  # Remove the last chunk
      chunks[-1] = pd.concat([chunks[-1], last_chunk])

  df_chunks = pd.DataFrame(chunks)
  
  # count occurrances of words in each chunk
  counters = df_chunks.parallel_apply(process_chunk, axis=1)
  display(counters)
  display(counters.shape)
  #concatenate the counters into one counter
  counter = counters[0]
  for i, current_counter in enumerate(counters):
    if i == 0:
      continue
    counter.update(current_counter)
  
  # Sort the data by frequency
  counter = counter.most_common()

  for index, word_freq in enumerate(counter):
    print(str(word_freq).center(20), end=" ")
    if ((index+1)%5 == 0):
      print()

    # get only the top 100
    if (index > 3):
      break
  
  # Convert Counter to DataFrame
  counter_df = pd.DataFrame(list(counter), columns=['Word', 'Count'])

  # Save DataFrame to CSV file
  counter_df.to_csv(f'./export_freq/{name}.csv', index=False)

  

In [107]:
news_path = "../ML-data/news_cleaned_2018_02_13.csv"

# is_reliable, skip_rows, name, with_stop
#plan = [(True, 8_000_000, "reliable_no_stop_freq_1M_9", False)]
plan = [(True, 1_000_000, "reliable_test", False)]
#plan = [(True, 0, "reliable_with_stop_freq_1M_1", True),
#        (False, 0, "fake_with_stop_freq_1M_1", True),
#        (True, 0, "reliable_no_stop_freq_1M_1", False),
#        (False, 0, "fake_no_stop_freq_1M_1", False)]

#for i in range(1, 10):
#  last_value = plan[-1][1]
#  new_value = (i * 1_000_000)
#  plan.append((True, new_value, f"reliable_with_stop_freq_1M_{i+1}", True))
#  plan.append((False, new_value, f"fake_with_stop_freq_1M_{i+1}", True))
#  plan.append((True, new_value, f"reliable_no_stop_freq_1M_{i+1}", False))
#  plan.append((False, new_value, f"fake_no_stop_freq_1M_{i+1}", False))

last_skip = -1
for i, plan_tuple in enumerate(plan):
  if last_skip != plan_tuple[1]:
    df_news = pd.read_csv(news_path, sep=',', skiprows=range(1, plan_tuple[1]), nrows=1_000_000)
  else:
    display(f"i: {i}, skipped read_csv")
        
  last_skip = plan_tuple[1]    

  view = get_view(df_news, plan_tuple[0])
  display(view.head())
  #if plan_tuple[3]:
  #  df_tokenized = view['content'].parallel_apply(process_text_keep_stop, nlp=nlp)
  #else:
  #  df_tokenized = view['content'].parallel_apply(process_text, nlp=nlp)

  run_and_write_frequency_faster(plan_tuple[0], view, f"{plan_tuple[2]}_rows_{view.shape[0]}", plan_tuple[3])

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [1]:
news_path = "../ML-data/news_cleaned_2018_02_13.csv"

df_news = pd.read_csv(news_path, sep=',', skiprows=range(1, 1), nrows=1_000_000)

#view = get_view(df_news, plan_tuple[0])
#display(view.head())

df_fake_news_view = df_news[df_news["type"] == "fake"]
df_reliable_news_view = df_news[df_news["type"] == "reliable"]

display(df_fake_news_view.shape)
display(df_fake_news_view.head())

display(df_reliable_news_view.shape)
display(df_reliable_news_view.head())

NameError: name 'pd' is not defined

In [4]:
news_path = "../ML-data/news_cleaned_2018_02_13.csv"

df_news = pd.read_csv(news_path, sep=',', skiprows=range(1, 8_000_000), nrows=1_000_000)

#view = get_view(df_news, plan_tuple[0])
#display(view.head())

df_fake_news_view = df_news[df_news["type"] == "fake"]
df_reliable_news_view = df_news[df_news["type"] == "reliable"]

display(df_fake_news_view.shape)
display(df_fake_news_view.head())

display(df_reliable_news_view.shape)
display(df_reliable_news_view.head())

(0, 17)

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source


(529108, 17)

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,9848,9348910,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,"FRIEDMAN--Sol, 99. Adored husband of the late ...",2018-02-11 00:44:32.784540,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Paid Notice: Deaths FRIEDMAN, SOL",,,['FRIEDMAN SOL'],"FRIEDMAN--Sol, 99. Adored husband of the late ...",,,nytimes
1,9849,9348911,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,"LEVINE--Arnold, on August 2nd, 2010 at 87, sur...",2018-02-11 00:44:32.784599,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Paid Notice: Deaths LEVINE, ARNOLD",,,['LEVINE ARNOLD'],"LEVINE--Arnold, on August 2nd, 2010 at 87, sur...",,,nytimes
2,9850,9348912,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,"LEVINSON--Harry, 74, of Monsey, NY. Renowned r...",2018-02-11 00:44:32.784679,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Paid Notice: Deaths LEVINSON, HARRY",,,['LEVINSON HARRY'],"LEVINSON--Harry, 74, of Monsey, NY. Renowned r...",,,nytimes
3,9851,9348913,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,"BERGMAN--Irene W. Died August 2, 2010 peaceful...",2018-02-11 00:44:32.784728,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Paid Notice: Deaths BERGMAN, IRENE W",,,['BERGMAN IRENE W'],"BERGMAN--Irene W. Died August 2, 2010 peaceful...",,,nytimes
4,9852,9348914,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,9 P.M. (ABC) SHAQ VSShaquille O'Neal tries onc...,2018-02-11 00:44:32.794130,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,What's On Today,,,[''],9 P.M. (ABC) SHAQ VSShaquille O'Neal tries onc...,,,nytimes


In [4]:
news_path = "../ML-data/news_cleaned_2018_02_13.csv"

df_news = pd.read_csv(news_path, sep=',', skiprows=range(1, 7_000_000), nrows=1_000_000)

#view = get_view(df_news, plan_tuple[0])
#display(view.head())

df_fake_news_view = df_news[df_news["type"] == "fake"]
df_reliable_news_view = df_news[df_news["type"] == "reliable"]

display(df_fake_news_view.shape)
display(df_fake_news_view.head())

display(df_reliable_news_view.shape)
display(df_reliable_news_view.head())

  exec(code_obj, self.user_global_ns, self.user_ns)


(0, 17)

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source


(1000000, 17)

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,9848,8348907,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/opinion/l-p...,To the Editor:\n\nThe rags-to-riches myth decr...,2018-02-11 00:34:53.230738,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Poverty and Rising Tides,,,"['WEINSTEIN MICHAEL M', 'REAGAN RONALD WILSO...",Leonard M Greene letter disputes Michael M Wei...,,,nytimes
1,9849,8348908,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/us/the-2000...,Mr. McCain's maverick message of campaign fina...,2018-02-11 00:34:53.230864,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Only Republicans Can Help McCain Win in Califo...,Todd S. Purdum,,"['CALIFORNIA', 'MCCAIN JOHN', 'BUSH GEORGE W...",Sen John McCain's improbable quest has been to...,,,nytimes
2,9850,8348909,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/garden/desi...,''We wanted to materialize the shadows and lig...,2018-02-11 00:34:53.230946,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Little by Little, a California Dream Materializes",Joseph Giovannini,,"['VENICE (CALIF)', 'FONTIVEROS JOSE', 'BOCTOR...","Marianna Boctor and Jose Fontiveros, architect...",,,nytimes
3,9851,8348910,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/garden/reta...,"CB2's 6,000 square feet stand on the north end...",2018-02-11 00:34:53.231092,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Retailers Are Rushing To Feather Gen Nest,Rick Marin,,"['TARGET CORP', 'CRATE & BARREL', 'WILLIAMS-SO...","Upscale home furnishings retailers, such as Cr...",,,nytimes
4,9852,8348911,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/business/wo...,European Union regulators may delay a ruling f...,2018-02-11 00:34:53.231164,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,POSSIBLE DELAY ON MERGER,Alan Cowell,,"['EUROPEAN UNION', 'VODAFONE AIRTOUCH PLC', 'M...",European Union regulators may delay for up to ...,,,nytimes


In [5]:
df_news.to_csv(f'./export/news_mostly_reliable_1M_skip_7M.csv', index=False)

In [6]:
rel = "../ML-code/export/news_mostly_reliable_1M_skip_7M.csv"

df_news = pd.read_csv(rel, sep=',')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
display(df_news.shape)
display(df_news)

(1000000, 17)

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,9848,8348907,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/opinion/l-p...,To the Editor:\n\nThe rags-to-riches myth decr...,2018-02-11 00:34:53.230738,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Poverty and Rising Tides,,,"['WEINSTEIN MICHAEL M', 'REAGAN RONALD WILSO...",Leonard M Greene letter disputes Michael M Wei...,,,nytimes
1,9849,8348908,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/us/the-2000...,Mr. McCain's maverick message of campaign fina...,2018-02-11 00:34:53.230864,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Only Republicans Can Help McCain Win in Califo...,Todd S. Purdum,,"['CALIFORNIA', 'MCCAIN JOHN', 'BUSH GEORGE W...",Sen John McCain's improbable quest has been to...,,,nytimes
2,9850,8348909,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/garden/desi...,''We wanted to materialize the shadows and lig...,2018-02-11 00:34:53.230946,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Little by Little, a California Dream Materializes",Joseph Giovannini,,"['VENICE (CALIF)', 'FONTIVEROS JOSE', 'BOCTOR...","Marianna Boctor and Jose Fontiveros, architect...",,,nytimes
3,9851,8348910,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/garden/reta...,"CB2's 6,000 square feet stand on the north end...",2018-02-11 00:34:53.231092,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Retailers Are Rushing To Feather Gen Nest,Rick Marin,,"['TARGET CORP', 'CRATE & BARREL', 'WILLIAMS-SO...","Upscale home furnishings retailers, such as Cr...",,,nytimes
4,9852,8348911,nytimes.com,reliable,https://www.nytimes.com/2000/02/24/business/wo...,European Union regulators may delay a ruling f...,2018-02-11 00:34:53.231164,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,POSSIBLE DELAY ON MERGER,Alan Cowell,,"['EUROPEAN UNION', 'VODAFONE AIRTOUCH PLC', 'M...",European Union regulators may delay for up to ...,,,nytimes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,9843,9348905,nytimes.com,reliable,https://www.nytimes.com/2010/08/03/world/middl...,"Photo\n\nAQABA, Jordan — A mysterious rocket t...",2018-02-11 00:44:32.784102,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Rocket Hits Resort on Border of Jordan and Israel,"Stephen Farrell, Isabel Kershner",,"['Israel', 'Jordan', 'Rockets and Rocket Propu...",A rocket that was likely meant for Israel also...,,,nytimes
999996,9844,9348906,nytimes.com,reliable,https://www.nytimes.com/2010/08/03/world/asia/...,"SEOUL, South Korea — After meeting Monday with...",2018-02-11 00:44:32.784174,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,U.S. Envoy Holds Talks on North Korea Sanctions,Choe Sang-Hun,,"['North Korea', 'Nuclear Weapons']","In Seoul on Monday, Robert Einhorn vowed to ra...",,,nytimes
999997,9845,9348907,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,CRANE--Sarah. The Board of Directors of Planne...,2018-02-11 00:44:32.784230,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Paid Notice: Deaths CRANE, SARAH",,,['CRANE SARAH'],CRANE--Sarah. The Board of Directors of Planne...,,,nytimes
999998,9846,9348908,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,Although July fell just short of being the hot...,2018-02-11 00:44:32.784321,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"July Missed Record for Heat, But Set One for P...",,,"['Weather', 'Electric Light and Power', 'Mcgee...",Although July fell just short of being the hot...,,,nytimes


In [119]:
results = run_and_write_frequency_faster(True, view['content'], f"{plan_tuple[2]}_rows_{view.shape[0]}", False)

'Counting True...'

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

content    {'BERGMAN': 7, 'Irene': 417, 'W.': 4291, 'die'...
content    {'LEVINSON': 20, 'Harry': 2091, '74': 814, 'Mo...
content    {'GWATHMEY': 2, 'Charles': 4410, 'hear': 13578...
content    {'FRIEDMAN': 68, 'Sol': 232, '99': 1242, 'ador...
dtype: object

(4,)

 ('\n\n', 5342670)     ('say', 1607264)     ('Mr.', 1188301)     ('New', 707928)      ('year', 641847)   


In [62]:
display(type(results))
#display(results.shape)
#display(results.head(1))

for index, value in enumerate(results):
  print("Index:", index)
  print("Value:", value)

  counter_a = value[0]
  counter_a = counter_a.most_common()
  display(counter_a)
  

list

Index: 0
Value: Counter()


AttributeError: 'int' object has no attribute 'most_common'

In [None]:
df = pd.read_csv("./export/fake_counts_3M_2.csv")
display(df.shape)
df_mod = pd.read_csv("./export/fake_counts_3_mod.csv")
display(df_mod.shape)
replace_url_column(df, df_mod, "fake_counts_part_3_3M_fixed")



(327869, 3)

(327869, 1)

"Wrote <class 'type'> counts to fake_counts_part_3_3M_fixed"

In [None]:
news_path = "../ML-data/news_cleaned_2018_02_13.csv"

#df_news = pd.read_csv(news_path)

#1
df_news = pd.read_csv(news_path, sep=',', nrows=1_000_000)
display(df_news.head(1))
display(df_news.shape)

fake_view = get_view(df_news)

run_and_write_all_counts("fake", fake_view, "_1_mod_test")


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0,2,express.co.uk,rumor,https://www.express.co.uk/news/science/738402/...,"Life is an illusion, at least on a quantum lev...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,,[''],THE UNIVERSE ceases to exist when we are not l...,,,


(2000000, 17)

Unnamed: 0,id,type,content
1611037,791180,fake,Market Finally Breaks Out\n\n% of readers thin...


(104591, 3)

(12927, 3)

(13058, 3)

'reliable:'

(5275, 2)

'fake:'

(99316, 2)

'Counting fake...'

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24829), Label(value='0 / 24829')))…

Unnamed: 0,url_count
1611037,1
1624635,1
1644701,2
25200,1
437550,0


'Wrote fake counts to fake_counts'

In [None]:
#news_path = "./data/fake-news/test.csv"
#news_path = "../ML-data/news_10_percent_utf8.csv"
news_path = "../ML-data/news_cleaned_2018_02_13.csv"

#df_news = pd.read_csv(news_path)

#1
df_news = pd.read_csv(news_path, sep=',', nrows=1_000_000)
display(df_news.head(1))
display(df_news.shape)

fake_view = get_view(df_news)

run_and_write_all_counts("fake", fake_view, "_1_mod")

#2
df_news = pd.read_csv(news_path, sep=',', skiprows=range(1, 1_000_000), nrows=3_000_000)
display(df_news.head(1))
display(df_news.shape)

fake_view = get_view(df_news)

run_and_write_all_counts("fake", fake_view, "_2_mod")

#3
df_news = pd.read_csv(news_path, sep=',', skiprows=range(1, 4_000_000), nrows=3_000_000)
display(df_news.head(1))
display(df_news.shape)

fake_view = get_view(df_news)

run_and_write_all_counts("fake", fake_view, "_3_mod")



  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0,2,express.co.uk,rumor,https://www.express.co.uk/news/science/738402/...,"Life is an illusion, at least on a quantum lev...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,,[''],THE UNIVERSE ceases to exist when we are not l...,,,


(1000000, 17)

Unnamed: 0,id,type,content
670747,602279,fake,PETRUS ROMANUS=Pope Francis (Installed March 1...


(64035, 3)

(7915, 3)

(7995, 3)

'reliable:'

(2114, 2)

'fake:'

(61921, 2)

'Counting fake...'

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15481), Label(value='0 / 15481')))…

Unnamed: 0,url_count
670747,0
434204,0
75250,2
72823,1
436766,0


'Wrote fake counts to fake_counts'

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,9963,1170150,express.co.uk,rumor,https://www.express.co.uk/life-style/health/74...,GETTY Tension headache: Grinding teeth can cau...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Headache CURE: Tackling THIS common night time...,Olivia Lerche,,[''],TENSION headaches are the most common type of ...,,,


(3000000, 17)

Unnamed: 0,id,type,content
1172781,1910632,fake,"(MERU, CRWE, ATML, QADA) Stocks in Focus by Pe..."


(94791, 3)

(11716, 3)

(11835, 3)

'reliable:'

(7507, 2)

'fake:'

(87284, 2)

'Counting fake...'

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=21821), Label(value='0 / 21821')))…

Unnamed: 0,url_count
1172781,1
1963730,0
453917,0
2400762,2
1021906,1


'Wrote fake counts to fake_counts'

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,9881,4617588,themuslimissue.wordpress.com,hate,https://themuslimissue.wordpress.com/2013/06/1...,The satanic savages of Islam have no humanity ...,2017-11-27T01:15:32.269834,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,(Video) Muslim education: beat children and br...,,,[''],The satanic savages of Islam have no humanity ...,,,


(3000000, 17)

Unnamed: 0,id,type,content
943174,5749823,fake,SPLC Offers Their Own Hate Speech Directed at ...


(332042, 3)

(41040, 3)

(41454, 3)

'reliable:'

(4173, 2)

'fake:'

(327869, 2)

'Counting fake...'

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=81968), Label(value='0 / 81968')))…

Unnamed: 0,url_count
943174,0
1668537,0
1839478,0
152114,0
2304984,0


'Wrote fake counts to fake_counts'

In [None]:
import spacy
#spacy.require_gpu()
nlp = spacy.load('en_core_web_sm', exclude=["parser","ner"])


In [None]:

def do_thing(text, nlp):
  tokens = nlp(text)
  tokens = [token.lemma_ for token in tokens if not token.is_stop]
  return tokens

#from pandarallel import pandarallel
#pandarallel.initialize(progress_bar=True)


In [None]:
%%time
df_output = df_trimmed['content'].apply(do_thing, nlp=nlp)

display(df_output) 

0      [life, illusion, ,, quantum, level, ,, theory,...
1      [unfortunately, ,, attack, islamic, terrorism,...
2      [Los, Angeles, Police, Department, deny, $, 3,...
3      [White, House, decide, quietly, withdraw, tie,...
4      [", time, come, cut, tongue, support, peace, m...
                             ...                        
995    [shocking, ,, unprecedented, Christmas, day, n...
996    [Thrilling, Thursday, –, Nasdaq, 7,000, Summit...
997    [Obama, face, environmental, lawsuit, Chicago,...
998    [Fake, News, Satanic, say, Pope, \n\n, %, read...
999    [Metalla, Royalty, Streaming, increase, produc...
Name: content, Length: 1000, dtype: object

Wall time: 29.6 s


In [None]:

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import multiprocessing as mp
# Define your function that operates on each chunk
def process_chunk(chunk):
    #import spacy
    #nlp = spacy.load('en_core_web_sm', exclude=["tagger","parser","ner", "attribute_ruler"])

    #from pandarallel import pandarallel
    #pandarallel.initialize(nb_workers=1, progress_bar=True)
    
    #def do_thing(text, nlp):
    #    tokens = nlp(text)
    #    tokens = [token.lemma_ for token in tokens if not token.is_stop]
    #    return tokens
    
    #chunk = chunk['content'].apply(do_thing, nlp=nlp)
    return chunk

# Define your worker function
def worker(chunk, func, output_queue):
    result = chunk.apply(func)
    output_queue.put(result)

# Define your main function to split DataFrame into chunks and apply parallel processing
def parallel_process_df(df, func):
    # Get the number of CPU cores
    num_cores = mp.cpu_count()
    display('num_cores: '+str(num_cores))
    
    # Calculate the chunk size
    chunk_size = int(len(df) / num_cores) + 1
    
    # Split DataFrame into chunks
    chunks = [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)]
    
    # Create a queue to store results
    #output_queue = mp.Queue()
    
    mgr = mp.Manager()
    ns_list = []
    for chunk in chunks:
        ns = mgr.Namespace()
        ns.df = chunk
        ns_list.append(ns)

    dff = pd.DataFrame(np.random.rand(101, 5), columns=list('abcde'))
    display(dff.shape)

    def processor(dff):
        return dff
    
    split_arr = np.array_split(dff, mp.cpu_count())
    display(split_arr)

    # process data
    pool = mp.Pool(processes = mp.cpu_count())
    results = pool.map(processor, np.array_split(dff, mp.cpu_count()))

    #with mp.Pool() as pool:
    #    results = pool.map(func, ns_list)
    
    # Get results from the queue
    #results = [output_queue.get() for _ in processes]
    
    # Concatenate processed chunks back into a single DataFrame
    result_df = pd.concat(results)
    
    return result_df



# Apply parallel processing on DataFrame
result_df = parallel_process_df(df_trimmed, process_chunk)

print(result_df.head())

In [None]:
#import spacy
#nlp = spacy.load('en_core_web_sm', exclude=["tagger" ,"parser", "ner", "attribute_ruler"])
pipeline_components = nlp.pipeline

# Display the pipeline components
print("Pipeline components:")
for component in pipeline_components:
    print(component[0])

Pipeline components:
tok2vec
lemmatizer
