#### **This notebook gets the userid and tweet time of all replies**
#### **that have greater 5 or more replies from IO accounts**

In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import json
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config_hp
import helper.pandas_helper as pd_hp
import helper.twitter_helper as twitter_hp

##### **Read config files**

In [53]:
config = config_hp.config()
path = config['PATHS']

derived_path = path['derived_path']
all_tweet_data = path['all_tweet_data']
plot_path = path['plot_path']
external_reply = path['external_reply']
conversation_ids_5 = path['conversation_ids_5']

##### **Read files**

In [3]:
all_campaigns, names = st.bundle_campaign()

In [4]:
df_replies = pd.read_pickle(external_reply)

In [5]:
df_replies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21427154 entries, 0 to 21427153
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   replier_tweetid  int64  
 1   replier_userid   object 
 2   poster_tweetid   float64
 3   poster_userid    float64
 4   tweet_language   object 
 5   tweet_time       object 
 6   year             object 
 7   campaign         object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.3+ GB


In [6]:
df_replies = df_replies.astype({
    'poster_tweetid': int,
    'poster_userid': int
})

In [7]:
df_replies['poster_tweetid'].nunique()

15256547

In [8]:
all_5_conversation = file_hp.read_file(conversation_ids_5)

all_5_conversation = [int(i) for i in all_5_conversation]

In [9]:
print('Type of type data: ', type(all_5_conversation))
print('Total conversations :' , len(all_5_conversation))
print('Type of element :', type(all_5_conversation[0]))

Type of type data:  <class 'list'>
Total conversations : 96041
Type of element : <class 'int'>


##### **Getting all the tweets with 5 or more replies**

In [10]:
df_conv = df_replies.loc[
    df_replies['poster_tweetid'].isin(all_5_conversation)][['poster_tweetid', 'poster_userid', 'tweet_time']]

In [11]:
df_conv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1507927 entries, 973 to 21426755
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   poster_tweetid  1507927 non-null  int64 
 1   poster_userid   1507927 non-null  int64 
 2   tweet_time      1507927 non-null  object
dtypes: int64(2), object(1)
memory usage: 46.0+ MB


In [42]:
df_cov = df_conv.sort_values(by=['tweet_time'])

df_cov['tweet_time'] = pd.to_datetime(df_cov['tweet_time'])

In [43]:
df_first_last = pd_hp.read_first_last_row_of_grp(df_conv,
                                                 'poster_tweetid',
                                                 'tweet_time'
                                                )

In [45]:
df_first_last.head()

Unnamed: 0,poster_tweetid,poster_userid,tweet_time
0,175649582045347840,131812518,2012-03-02 18:40
1,175649582045347840,131812518,2012-03-02 18:44
2,176283333385396224,200524435,2012-03-04 13:29
3,176283333385396224,200524435,2012-03-04 13:59
4,177373540574695424,131812518,2012-03-07 12:48


In [47]:
df_first_last['last_reply'] = pd.to_datetime(
    df_first_last['tweet_time']) + pd.Timedelta(24, unit='h') 

In [48]:
df_first_last = df_first_last.sort_values(by=['tweet_time'])

df_first_last.head()

Unnamed: 0,poster_tweetid,poster_userid,tweet_time,last_reply
0,175649582045347840,131812518,2012-03-02 18:40,2012-03-03 18:40:00
1,175649582045347840,131812518,2012-03-02 18:44,2012-03-03 18:44:00
2,176283333385396224,200524435,2012-03-04 13:29,2012-03-05 13:29:00
3,176283333385396224,200524435,2012-03-04 13:59,2012-03-05 13:59:00
4,177373540574695424,131812518,2012-03-07 12:48,2012-03-08 12:48:00


In [49]:
df_grps = df_first_last.groupby('poster_tweetid').tail(1)

In [50]:
df_grps.head()

Unnamed: 0,poster_tweetid,poster_userid,tweet_time,last_reply
1,175649582045347840,131812518,2012-03-02 18:44,2012-03-03 18:44:00
3,176283333385396224,200524435,2012-03-04 13:59,2012-03-05 13:59:00
5,177373540574695424,131812518,2012-03-07 13:19,2012-03-08 13:19:00
7,191443676885565440,131812518,2012-04-15 08:38,2012-04-16 08:38:00
9,230276901355286528,200524435,2012-07-31 12:44,2012-08-01 12:44:00


##### **Save userid and tweet time**

In [51]:
file_hp.create_folder(derived_path, 'posters')
user_path = os.path.join(derived_path, 'posters')

In [52]:
user_path

'/N/slate/potem/data/derived/posters'

In [53]:
splited_reply_ids_path = path['splited_reply_ids']
splited_reply_ids_path = os.path.join(splited_reply_ids_path, 
                                      '*.txt')

In [57]:
for file in glob.glob(splited_reply_ids_path):
    parts = file.split('/')
    index = parts[-1].split('_')[1]
    
    # if int(index) > 60:
    #     continue
        
    ids = file_hp.read_file(file)

    ids = [int(i) for i in ids]
    
    tweets = df_grps.loc[df_grps['poster_tweetid'].isin(ids)]
    
    rows = tweets['poster_userid']
    start_time = tweets['tweet_time']
    end_time = tweets['last_reply']
    
    file_hp.write_to_file_row_each_line(user_path,
                                        'posters_' + parts[-1],
                                        rows
                                       )
    file_hp.write_to_file_row_each_line(user_path,
                                        'start_time_' + parts[-1],
                                        start_time
                                       )
    
    file_hp.write_to_file_row_each_line(user_path,
                                         'end_time_' + parts[-1],
                                        end_time
                                       )

In [56]:
# file_hp.read_file(user_path + '/job_9_400_450.txt')

##### **Get tweets from poster ids after reply (within in window of 24 hours)**

In [88]:
user_path = os.path.join(derived_path, 'posters')
poster_path = os.path.join(user_path, 'posters_*')

In [84]:
poster_tweet_path = file_hp.create_folder(derived_path, 
                                          'posters_tweets')

In [83]:
import imp

imp.reload(file_hp)

  import imp


<module 'helper.file_helper' from '/geode2/home/u070/potem/Quartz/project/infoOps-strategy/package/helper/file_helper.py'>

In [120]:
for id_file in glob.glob(poster_path):
    parts = id_file.split(os.sep)
    index = parts[-1].split('_')[2]
    filename = parts[-1].split('.')[0]
    
    if int(index) > 60:
        continue
        
    start_time = filename.replace('posters', 'start_time') + '.txt'
    end_time = filename.replace('posters', 'end_time') + '.txt'
    campaign_json = f'{filename}.jsonl'
    
    path_to_json = os.path.join(poster_tweet_path, campaign_json)
    start_time_file = os.path.join(user_path, start_time)
    end_time_file =  os.path.join(user_path, end_time)
    
    start_file = file_hp.read_file(start_time_file)
    end_file = file_hp.read_file(end_time_file)
    posters = file_hp.read_file(id_file)
    
    for i, user in enumerate(posters):
        job_info = filename.replace('posters_', '')
        poster_filename = f'{job_info}_{user}.jsonl'
        path = os.path.join(poster_tweet_path, poster_filename)
        
        start_file[i] = pd.to_datetime(start_file[i]) + pd.Timedelta(0, unit='s')
        end_file[i] = pd.to_datetime(end_file[i]) + pd.Timedelta(0, unit='s')
        
        start_file[i] = start_file[i].isoformat('T')
        end_file[i] = end_file[i].isoformat('T')
        
        command = f'twarc2 timeline --start-time={start_file[i]} ' \
        f' --sort-order=relevancy --use-search --exclude-retweets --exclude-replies ' \
        f'{user} > {path}'
        
        print(command)
        
        os.system(command)  
        
        break

    break
        

twarc2 timeline --start-time=2015-12-17T18:57:00 --end-time=2015-12-18T18:57:00  --use-search --exclude-retweets --exclude-replies 1933612350 > /N/slate/potem/data/derived/posters_tweets/job_34_1650_1700_1933612350.jsonl


100%|██████████| Processed a day/a day [14:32<00:00, 2 tweets total ]    


##### **Get profile metadata**

In [29]:
df_replies = pd.read_pickle(external_reply)

In [30]:
df_posters = df_replies.loc[df_replies['poster_tweetid'].isin(all_5_conversation)]

In [31]:
df_posters = df_posters.astype({
    'poster_userid': int,
})

In [32]:
poster_ids = list(df_posters['poster_userid'].unique())

print('Total posters :', len(poster_ids))

Total posters : 15016


In [33]:
poster = config['POSTER_PATH']

poster_path = poster['poster_info_path']

In [34]:
file_hp.write_to_file_row_each_line(poster_path, 
                                    'poster_userid.txt',
                                    poster_ids, 
                                    )

In [36]:
poster_path = config['POSTER_PATH']
poster_id_path = poster_path['poster_id_path']
poster_json_path = poster_path['poster_json_path']

In [37]:
def get_profile_info(profile_id_file, 
                     profile_json_file):
    '''
    Gets the user meta data
    :param profile_id_file: Twitter account userid file
    :param profile_json_file: File path with file name for
    json of profiles
    '''
    command = f'twarc2 users {profile_id_file} {profile_json_file}'
    print(command)
    os.system(command)  
        
        
get_profile_info(poster_id_path, poster_json_path)

twarc2 users /N/slate/potem/data/derived/posters_info/poster_userid.txt /N/slate/potem/data/derived/posters/poster_info.jsonl


##### **Parse the poster info**

In [83]:
poster_path = config['POSTER_PATH']

poster_json_path = poster_path['poster_json_path']
poster_id_path = poster_path['poster_id_path']
poster_info_path = poster_path['poster_info_path']

In [91]:
def get_empty_profile_dict():
    '''
    Creates an empty variable to store profile information
    '''
    profiles = {
        'created_at': None,
        'verified': None,
        'description': None,
        'protected': None,
        'username': None,
        'id': None,
        'proile_image_url': None,
        'pinned_tweet_id': None,
        'name': None,
        'public_metrics': None,
        'followers_count': None,
        'following_count': None,
        'tweet_count': None,
        'listed_count': None
    }
    
    return profiles


def set_values_in_profile_dict(values):
    '''
    Sets variable values for profiles
    :param dic_variable: empty dictionary variable
    :param values: values from which values are to be stored
    
    :return dictionary
    '''
    profile = get_empty_profile_dict()
    profile['created_at']=values['created_at']
    profile['verified']=values['verified']
    profile['description']=values['description']
    profile['protected']=values['protected']
    profile['username']=values['username']
    profile['id']=values['id']
    profile['profile_image_url']=values['profile_image_url']

    if 'pinned_tweet_id' in values:
        profile['pinned_tweet_id']=values['pinned_tweet_id']

    profile['name']=values['name']

    if 'public_metrics' in values:
        public_metrics = values['public_metrics']
        profile['followers_count']=public_metrics['followers_count']
        profile['following_count']=public_metrics['following_count']
        profile['tweet_count']=public_metrics['tweet_count']
        profile['listed_count']=public_metrics['listed_count']
        
    return profile



def set_values_for_profile_error(values):
    '''
    Sets variable values for profiles which are not available
    :param values: values to be stored
    
    :return dictionary
    '''
    if values['resource_type'] != 'user':
        return None
    
    profile = get_empty_profile_dict()
    
    if 'suspended' in values['detail']:
        profile['verified'] = 'suspended'
    if 'not find user' in values['detail']:
        profile['verified'] = 'not found'
        
    profile['description'] = values['detail']
    profile['id'] = values['value']
    
    return profile



def parse_profile_json(profile_file, 
                       output_file=None) -> pd.DataFrame:
    '''
    Parse the profile json file
    
    :param profile_file: location (along with file name)
    to the profile json file
    :param output_file: location (along with file name) to save
    the parsed file
    
    :return DataFrame
    '''
    
    all_profiles = []
    
    with open(profile_file, 'r') as json_file:
        for row in json_file:
            one_row = json.loads(row)
            # print(one_row.keys())
            # print(one_row['errors'])
            
            if 'errors' in one_row:
                for single_profile in one_row['errors']:
                    profile = set_values_for_profile_error(single_profile)
                    
                    if profile != None:
                        all_profiles.append(profile)

            if 'data' not in one_row:
                continue
                
            for single_profile in one_row['data']:
                profile = set_values_in_profile_dict(single_profile)
                
                if profile != None:
                    all_profiles.append(profile)
                
    
    df = pd.DataFrame.from_records(data=all_profiles)
    
    if output_file is not None:
        df.to_pickle(output_file)
        
    return df


poster_info_file = os.path.join(poster_info_path, 
                                'profile_info.pkl.gz')
df_profiles = parse_profile_json(poster_json_path, 
                                 poster_info_file)

print('Original number of posters: ', 
      len(file_hp.read_file(poster_id_path)))

Original number of posters:  15016


In [94]:
df_not_found = df_profiles.loc[df_profiles['verified'] == 'not found']
df_suspended = df_profiles.loc[df_profiles['verified'] == 'suspended']

In [95]:
print('Total not found accounts, : ', len(df_not_found))
print('Total suspended accounts, : ', len(df_suspended))

Total not found accounts, :  3992
Total suspended accounts, :  5041


In [96]:
poster_info_file

'/N/slate/potem/data/derived/posters_info/profile_info.pkl.gz'

##### **Remove dead posters from control files**

In [7]:
config = config_hp.config()
poster_control = config['POSTER_CONTROL']
poster_control_path = poster_control['poster_control']

poster = config['POSTER_PATH']
poster_dead_tweet_file = poster['poster_dead_tweet_file']

In [53]:
def remove_dead_poster(path, org_rows):
    '''
    Remove rows from multiple file
    :param path: path to files with filename wild cards
    :param org_rows: original row list without unwanted rows
    '''
    for file in glob.glob(path):
        print(file)
        
        start_time_file_name = file.replace('posters_','start_time_')
        start_time_file = file_hp.read_file(start_time_file_name)
        
        ids = file_hp.read_file(file)
        
        if len(ids) <= len(start_time_file):
            continue
            
        for i, row in enumerate(ids):
            if row not in org_rows:
                ids.remove(str(row))
                start_time_file.pop(i)
                
        temp_file = open(f'{file}', 'r+')
        
        temp_file.truncate(0)
        
        file_hp.write_to_file_row_each_line(file, 
                                    None, 
                                    ids)
        temp_file = open(f'{start_time_file_name}', 'r+')
        
        temp_file.truncate(0)
        
        file_hp.write_to_file_row_each_line(start_time_file_name, 
                                            None, 
                                            ids)
        
# org_rows = file_hp.read_file(poster_dead_tweet_file)

# remove_dead_poster(poster_control_path + os.sep + 'posters*.txt',
#                    org_rows
#                   )

In [55]:
def remove_empty_files(path):
    '''
    Checks the number of ids in txt files
    :param path: Path of files
    '''
    for file in glob.glob(path):
        ids = set(file_hp.read_file(file))
        
        if len(ids) == 0:
            print(file, len(ids))
        
id_path = poster_control_path + os.sep + 'posters*.txt'

remove_empty_files(id_path)

#### **Correcting the time to get the control tweets**

In [1]:
## posters folder has poster ids, start time and end time
## posters_tweets has tweets

In [3]:
import imp

imp.reload(config_hp)

config = config_hp.config()
path =  config['PATHS']
poster = config['POSTER_PATH']
poster_control_ids = config['POSTER_CONTROL']

external_reply = path['external_reply']
conversation_ids_5 = path['conversation_ids_5']
poster_alive_file = poster['poster_alive_file']
poster_alive_with_tweet_count_file = poster['poster_alive_with_tweet_count_file']

In [4]:
df_external_replies = pd.read_pickle(external_reply)

In [5]:
df_external_replies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21427154 entries, 0 to 21427153
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   replier_tweetid  int64  
 1   replier_userid   object 
 2   poster_tweetid   float64
 3   poster_userid    float64
 4   tweet_language   object 
 5   tweet_time       object 
 6   year             object 
 7   campaign         object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.3+ GB


In [6]:
ids = file_hp.read_file(conversation_ids_5)

print('Type : ', type(ids[0]))
print('Number of datapoints :', len(ids))
print('Example id: ', ids[0])

Type :  <class 'str'>
Number of datapoints : 96041
Example id:  64953578015571968


In [7]:
df_external_replies = df_external_replies.astype({
    'poster_tweetid': int,
})
    
df_external_replies = df_external_replies.astype({
    'poster_tweetid': str,
    'poster_userid': int
})

In [8]:
# df_external_replies['poster_tweetid']

In [9]:
df_5 = df_external_replies.loc[
    df_external_replies['poster_tweetid'].isin(ids)]

print(len(df_5))

1507927


In [10]:
df_5['poster_userid'].nunique()

15016

##### **Getting tweets size for each posters**

In [11]:
df_poster_tweets = (df_5.groupby(['poster_userid'])['poster_tweetid']
                   .nunique()
                   .to_frame('count')
                   .reset_index())

In [12]:
df_poster_tweets['count'].min()

1

In [13]:
df_poster_tweets.columns

Index(['poster_userid', 'count'], dtype='object')

##### **Getting last tweet and tweet time for each poster**

In [14]:
df_5 = df_5.sort_values(by=['tweet_time'],
                        ascending=True
                       )

In [15]:
df_poster_last_tweet = (df_5.groupby(['poster_userid'])[['poster_userid',
                                                         'poster_tweetid', 
                                                         'tweet_time']]
                        .tail(1)
                        .reset_index(drop=True))

##### **Keeping just alive posters**

In [16]:
alive_posters = file_hp.read_file(poster_alive_file)

In [17]:
type(alive_posters[0])

str

In [18]:
len(alive_posters)

5983

In [19]:
df_poster_last_tweet = df_poster_last_tweet.astype({
    'poster_userid': str
})
df_poster_tweets = df_poster_tweets.astype({
    'poster_userid': str
})

In [20]:
df_poster_last_tweet = df_poster_last_tweet.loc[
    df_poster_last_tweet['poster_userid'].isin(alive_posters)]

In [21]:
df_poster_tweets = df_poster_tweets.loc[
    df_poster_tweets['poster_userid'].isin(alive_posters)]

In [22]:
print(len(df_poster_last_tweet))
print(len(df_poster_tweets))

5983
5983


##### **Save alive posters**

In [23]:
df_size = df_poster_last_tweet.merge(df_poster_tweets,
                           on='poster_userid')

In [24]:
df_size = df_size.astype({
    # 'poster_tweetid': str,
    'poster_userid': int
})

In [25]:
data_list = df_size[
    ['poster_userid', 'tweet_time', 'count']
].values.tolist()

In [26]:
data_list[0]

[200524435, '2012-07-31 12:57', 2]

In [27]:
len(data_list)

5983

##### **Save three information in file**

In [28]:
file_hp.write_to_file_row_each_line(poster_alive_with_tweet_count_file,
                                    None,
                                    data_list
                                   )

#### **Get poster tweets after removing alive posters**

In [51]:
import imp

imp.reload(config_hp)

config = config_hp.config()
poster = config['POSTER_PATH']
poster_alive_file = poster['poster_alive_file']
poster_alive_with_tweet_count_file = poster['poster_alive_with_tweet_count_file']

In [None]:
def get_poster_tweets_with_count(config):
    '''
    Gets the poster tweets
    :param config: config file to be loaded
    '''
    
    config = config.config()
    poster = config['POSTER_PATH']
    poster_alive_with_tweet_count_file = poster['poster_alive_with_tweet_count_file']
    
    poster_control = config['POSTER_CONTROL']
    poster_control_tweets = poster_control['poster_control_tweets']
    poster_tweet_path = file_hp.create_folder(poster_control_tweets, 
                                              'posters_new_tweets')
    poster_ids = file_hp.read_file(poster_alive_with_tweet_count_file)
    
    for row in poster_ids:
        row = row.strip('][').split(', ')

        user = int(row[0])
        start_time = row[1]
        count = row[2]
        poster_filename = f'control_tweets_{user}.jsonl'
        path = os.path.join(poster_tweet_path, poster_filename)

        start_time = pd.to_datetime(start_time) + pd.Timedelta(0, unit='s')
        start_time = start_time.isoformat('T')

        command = f'twarc2 timeline --start-time={start_time} ' \
        f' --sort-order=relevancy --use-search --exclude-retweets --exclude-replies ' \
        f'--limit {count} {user} > {path}'

        os.system(command)

# 
# get_poster_tweets_with_count(config_hp)

#### **Parse the control tweets from posters**

In [3]:
import imp

imp.reload(config_hp)
imp.reload(twitter_hp)

config = config_hp.config()
poster = config['POSTER_PATH']
poster_control = config['POSTER_CONTROL']

poster_alive_file = poster['poster_alive_file']
poster_alive_with_tweet_count_file = poster['poster_alive_with_tweet_count_file']

poster_control_new_tweets = poster_control['poster_control_new_tweets']
poster_control_info = poster_control['poster_control_info']
# poster_control_conversations = poster_control['poster_control_conversations']
pc_split_conversation_id = poster_control['pc_split_conversation_id']

In [4]:
control_posters = file_hp.read_file(poster_alive_with_tweet_count_file)

In [5]:
all_control_conversations = []
all_sum = 0
track = []
for row in control_posters:
    row = row.strip('][').split(', ')

    user = int(row[0])
    start_time = row[1]
    count = int(row[2])
    
    if count == 0:
        print(user)
        
    poster_filename = f'control_tweets_{user}.jsonl'
    new_path = os.path.join(poster_control_new_tweets, 
                            poster_filename)
    isExist = os.path.exists(new_path)
    
    if isExist == False:
        continue
        
    df = twitter_hp.parse_tweets(new_path)
    
    if len(df) == 0:
        continue
    
    if count <= df['tweetid'].nunique():
        all_sum = all_sum + count
        df['created_at'] = pd.to_datetime(df['created_at'])
        df = df.sort_values(by=['created_at'],
                        ascending=True
                       )
        top_count = df['tweetid'].head(count).tolist()
        
        track.append(user)
        all_control_conversations.extend(top_count)

print(all_sum)
print(len(track))
print(len(all_control_conversations))

6617
3167
6617


In [6]:
file_hp.write_to_file_row_each_line(poster_control_info,
                                    'poster_control_track_1.txt',
                                    track
                                   )
file_hp.write_to_file_row_each_line(poster_control_info,
                                    'poster_control_conversation_1.txt',
                                    all_control_conversations
                                   )

### **Split conversation ids**

In [7]:
conversation_path = os.path.join(poster_control_info,
                                    'poster_control_conversation_1.txt'
                                )

In [11]:
def split_into_files(input_file, 
                     save_path,
                     split_threshold=50,
                     prefix_for_file='job_control_conversations'
                    ):
    '''
    Splits the rows of file into multiple files
    :param input_file: text file which has data
    :param split_threshold: threshold to split the rows by
    :param save_path: path where files are to be saved
    :param prefix_for_file: prefix for new file names
    '''
    
    rows = file_hp.read_file(input_file)
    index = 1
    for i in range(0, len(rows), split_threshold):
        ids_split = rows[i:i+split_threshold]
        last = i + split_threshold

        #first is the index of job
        #second and third are the index of rows
        filename = f'{prefix_for_file}_{index}_{i}_{last}.txt'

        file_hp.write_to_file_row_each_line(save_path,
                                            filename,
                                            ids_split
                                           )

        index = index + 1

In [13]:
split_into_files(conversation_path,
                 pc_split_conversation_id,
                 split_threshold=10
                )

### **Create jobs**

In [14]:
import imp
import helper.slurm_helper as slurm_hp

imp.reload(config_hp)
imp.reload(twitter_hp)

config = config_hp.config()
poster_control = config['POSTER_CONTROL']

pc_split_conversation_id = poster_control['pc_split_conversation_id']
pc_extracted_conversations = poster_control['pc_extracted_conversations']

slurm_config = config['SLURM_PATH']
slurm_path = slurm_config['slurm_path']

In [15]:
slurm_script_path = file_hp.create_folder(slurm_path, 
                                          'control_conversations')

In [17]:
def create_multiple_jobs(multiple_file_path,
                         python_script,
                         path_slurm,
                         destination_folder,
                         logs_path=None
               ):
    '''
    Path of splited conversations
    :param multiple_file_path: path where multiple files are present
    :param python_script: script to be run
    :param path_slurm: path where slurm script to be saved
    :param destination_folder: folder where the data is to be saved
    :param logs_path: path where log file are to be saved
    '''
    new_path = os.path.join(multiple_file_path, f'job_control_*.txt')
    i = 0
    
    if logs_path == None:
        file_hp.create_folder(path_slurm, 'logs')
        logs_path = os.path.join(path_slurm, 'logs')
        
    for file in glob.glob(new_path):
        i = i + 1
        parts = file.split(os.sep)
        filename = (parts[-1]).split('.')[0]
        command = f'python {python_script} --file={file} ' + \
        f'--destination-folder={destination_folder}'
        
        job_name = f'{filename}'
        slurm_scrip_path = slurm_hp.create_slurm_script(job_name, 
                                     command,
                                     path_slurm)
        print(slurm_scrip_path)
        
        
        slurm_hp.despatch_single_job(slurm_scrip_path,
                                     logs_path
                                    )
        
        
# script_path = '/N/u/potem/Quartz/project/infoOps-strategy/script/py_scripts/data_sourcing/get_conversation.py'
# create_multiple_jobs(pc_split_conversation_id, 
#                      script_path,
#                      slurm_script_path,
#                      pc_extracted_conversations
#                     )

In [40]:
def despatch_job(path, logs_path):
    
    for file in glob.glob(path):
        job_index = file.split('/')[-1].split('_')[3]
        # print(file)
        if int(job_index) > 100:
            continue
            
        command = f'sbatch {file}'
        
        os.chdir(logs_path)
        os.system(command)
        print(job_index)

        
slurm_path_og='/N/u/potem/Quartz/sbatch'
slurm_path = os.path.join(slurm_path_og, 
                        'control_conversations')
file_hp.create_folder(slurm_path, 'logs')
logs_path = file_hp.create_folder(slurm_path, 'logs')
new_path = os.path.join(slurm_path, f'*.sh')

## Do not forget to change get_conversation file
# despatch_job(new_path, logs_path)

In [36]:
def cancel_jobs(start_no, end_no):
    '''
    Cancels the jobs
    :param start_no: starting job no
    :param end_no: ending job no
    '''
    print('here')
    for i in range(start_no, end_no+1):
        # id = start_no + i
        print(i)
        command = f'scancel {i}'

        os.system(command)
        
        
# cancel_jobs(1474611, 1474711)

In [41]:
1474611+100

1474711